Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c7
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/namei.c8
-rw-r--r--fs/afs/addr_list.c50
-rw-r--r--fs/afs/cell.c437
-rw-r--r--fs/afs/cmservice.c82
-rw-r--r--fs/afs/dir.c17
-rw-r--r--fs/afs/dynroot.c486
-rw-r--r--fs/afs/fs_probe.c32
-rw-r--r--fs/afs/fsclient.c4
-rw-r--r--fs/afs/internal.h98
-rw-r--r--fs/afs/main.c16
-rw-r--r--fs/afs/mntpt.c5
-rw-r--r--fs/afs/proc.c15
-rw-r--r--fs/afs/rxrpc.c8
-rw-r--r--fs/afs/server.c601
-rw-r--r--fs/afs/server_list.c6
-rw-r--r--fs/afs/super.c25
-rw-r--r--fs/afs/vl_alias.c7
-rw-r--r--fs/afs/vl_rotate.c2
-rw-r--r--fs/afs/volume.c15
-rw-r--r--fs/autofs/autofs_i.h2
-rw-r--r--fs/autofs/dev-ioctl.c3
-rw-r--r--fs/autofs/root.c14
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/bcachefs/Kconfig2
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/alloc_background.c190
-rw-r--r--fs/bcachefs/alloc_background.h2
-rw-r--r--fs/bcachefs/alloc_foreground.c31
-rw-r--r--fs/bcachefs/alloc_foreground.h19
-rw-r--r--fs/bcachefs/alloc_types.h2
-rw-r--r--fs/bcachefs/backpointers.c151
-rw-r--r--fs/bcachefs/backpointers.h26
-rw-r--r--fs/bcachefs/bcachefs.h20
-rw-r--r--fs/bcachefs/bcachefs_format.h16
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h29
-rw-r--r--fs/bcachefs/btree_cache.c1
-rw-r--r--fs/bcachefs/btree_gc.c18
-rw-r--r--fs/bcachefs/btree_io.c259
-rw-r--r--fs/bcachefs/btree_io.h4
-rw-r--r--fs/bcachefs/btree_iter.c14
-rw-r--r--fs/bcachefs/btree_iter.h9
-rw-r--r--fs/bcachefs/btree_locking.c8
-rw-r--r--fs/bcachefs/btree_node_scan.c29
-rw-r--r--fs/bcachefs/btree_trans_commit.c120
-rw-r--r--fs/bcachefs/btree_types.h13
-rw-r--r--fs/bcachefs/btree_update.c5
-rw-r--r--fs/bcachefs/btree_update.h2
-rw-r--r--fs/bcachefs/btree_update_interior.c150
-rw-r--r--fs/bcachefs/btree_update_interior.h7
-rw-r--r--fs/bcachefs/buckets.c80
-rw-r--r--fs/bcachefs/buckets.h31
-rw-r--r--fs/bcachefs/buckets_types.h27
-rw-r--r--fs/bcachefs/chardev.c38
-rw-r--r--fs/bcachefs/checksum.c25
-rw-r--r--fs/bcachefs/checksum.h2
-rw-r--r--fs/bcachefs/compress.c65
-rw-r--r--fs/bcachefs/data_update.c237
-rw-r--r--fs/bcachefs/data_update.h17
-rw-r--r--fs/bcachefs/debug.c34
-rw-r--r--fs/bcachefs/dirent.c274
-rw-r--r--fs/bcachefs/dirent.h17
-rw-r--r--fs/bcachefs/dirent_format.h20
-rw-r--r--fs/bcachefs/disk_accounting.h18
-rw-r--r--fs/bcachefs/disk_accounting_format.h12
-rw-r--r--fs/bcachefs/ec.c482
-rw-r--r--fs/bcachefs/ec.h46
-rw-r--r--fs/bcachefs/ec_types.h12
-rw-r--r--fs/bcachefs/errcode.h65
-rw-r--r--fs/bcachefs/error.c88
-rw-r--r--fs/bcachefs/error.h57
-rw-r--r--fs/bcachefs/extents.c249
-rw-r--r--fs/bcachefs/extents.h24
-rw-r--r--fs/bcachefs/extents_format.h24
-rw-r--r--fs/bcachefs/extents_types.h11
-rw-r--r--fs/bcachefs/eytzinger.c76
-rw-r--r--fs/bcachefs/eytzinger.h95
-rw-r--r--fs/bcachefs/fs-io-buffered.c38
-rw-r--r--fs/bcachefs/fs-io-direct.c20
-rw-r--r--fs/bcachefs/fs-ioctl.c34
-rw-r--r--fs/bcachefs/fs-ioctl.h20
-rw-r--r--fs/bcachefs/fs.c147
-rw-r--r--fs/bcachefs/fsck.c231
-rw-r--r--fs/bcachefs/inode.c24
-rw-r--r--fs/bcachefs/inode.h1
-rw-r--r--fs/bcachefs/inode_format.h3
-rw-r--r--fs/bcachefs/io_misc.c3
-rw-r--r--fs/bcachefs/io_read.c747
-rw-r--r--fs/bcachefs/io_read.h92
-rw-r--r--fs/bcachefs/io_write.c414
-rw-r--r--fs/bcachefs/io_write.h38
-rw-r--r--fs/bcachefs/io_write_types.h2
-rw-r--r--fs/bcachefs/journal.c191
-rw-r--r--fs/bcachefs/journal.h42
-rw-r--r--fs/bcachefs/journal_io.c99
-rw-r--r--fs/bcachefs/journal_reclaim.c10
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c7
-rw-r--r--fs/bcachefs/journal_types.h37
-rw-r--r--fs/bcachefs/lru.c100
-rw-r--r--fs/bcachefs/lru.h22
-rw-r--r--fs/bcachefs/lru_format.h6
-rw-r--r--fs/bcachefs/migrate.c26
-rw-r--r--fs/bcachefs/move.c456
-rw-r--r--fs/bcachefs/move_types.h20
-rw-r--r--fs/bcachefs/movinggc.c15
-rw-r--r--fs/bcachefs/namei.c (renamed from fs/bcachefs/fs-common.c)210
-rw-r--r--fs/bcachefs/namei.h (renamed from fs/bcachefs/fs-common.h)31
-rw-r--r--fs/bcachefs/opts.c115
-rw-r--r--fs/bcachefs/opts.h69
-rw-r--r--fs/bcachefs/progress.c63
-rw-r--r--fs/bcachefs/progress.h29
-rw-r--r--fs/bcachefs/rebalance.c46
-rw-r--r--fs/bcachefs/recovery.c4
-rw-r--r--fs/bcachefs/recovery_passes_types.h2
-rw-r--r--fs/bcachefs/reflink.c23
-rw-r--r--fs/bcachefs/sb-counters.c90
-rw-r--r--fs/bcachefs/sb-counters.h4
-rw-r--r--fs/bcachefs/sb-counters_format.h31
-rw-r--r--fs/bcachefs/sb-downgrade.c8
-rw-r--r--fs/bcachefs/sb-errors_format.h5
-rw-r--r--fs/bcachefs/sb-members.h16
-rw-r--r--fs/bcachefs/sb-members_format.h1
-rw-r--r--fs/bcachefs/snapshot.c7
-rw-r--r--fs/bcachefs/snapshot.h1
-rw-r--r--fs/bcachefs/str_hash.c2
-rw-r--r--fs/bcachefs/str_hash.h12
-rw-r--r--fs/bcachefs/super-io.c92
-rw-r--r--fs/bcachefs/super-io.h10
-rw-r--r--fs/bcachefs/super.c141
-rw-r--r--fs/bcachefs/super.h2
-rw-r--r--fs/bcachefs/super_types.h8
-rw-r--r--fs/bcachefs/sysfs.c141
-rw-r--r--fs/bcachefs/sysfs.h5
-rw-r--r--fs/bcachefs/trace.h101
-rw-r--r--fs/bcachefs/util.c231
-rw-r--r--fs/bcachefs/util.h16
-rw-r--r--fs/bcachefs/xattr.c2
-rw-r--r--fs/binfmt_elf.c21
-rw-r--r--fs/binfmt_elf_fdpic.c13
-rw-r--r--fs/btrfs/accessors.h1
-rw-r--r--fs/btrfs/acl.h2
-rw-r--r--fs/btrfs/async-thread.c11
-rw-r--r--fs/btrfs/backref.c4
-rw-r--r--fs/btrfs/bio.c38
-rw-r--r--fs/btrfs/block-group.c155
-rw-r--r--fs/btrfs/btrfs_inode.h17
-rw-r--r--fs/btrfs/compression.c31
-rw-r--r--fs/btrfs/compression.h26
-rw-r--r--fs/btrfs/ctree.c18
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/defrag.c78
-rw-r--r--fs/btrfs/defrag.h4
-rw-r--r--fs/btrfs/delayed-inode.c101
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.h2
-rw-r--r--fs/btrfs/dev-replace.c33
-rw-r--r--fs/btrfs/dir-item.c24
-rw-r--r--fs/btrfs/dir-item.h1
-rw-r--r--fs/btrfs/direct-io.c19
-rw-r--r--fs/btrfs/direct-io.h2
-rw-r--r--fs/btrfs/discard.c34
-rw-r--r--fs/btrfs/discard.h1
-rw-r--r--fs/btrfs/disk-io.c109
-rw-r--r--fs/btrfs/export.c51
-rw-r--r--fs/btrfs/extent-io-tree.c8
-rw-r--r--fs/btrfs/extent-tree.c63
-rw-r--r--fs/btrfs/extent-tree.h1
-rw-r--r--fs/btrfs/extent_io.c589
-rw-r--r--fs/btrfs/extent_io.h9
-rw-r--r--fs/btrfs/file-item.c30
-rw-r--r--fs/btrfs/file-item.h2
-rw-r--r--fs/btrfs/file.c28
-rw-r--r--fs/btrfs/file.h2
-rw-r--r--fs/btrfs/free-space-cache.c57
-rw-r--r--fs/btrfs/free-space-tree.c45
-rw-r--r--fs/btrfs/fs.c1
-rw-r--r--fs/btrfs/fs.h26
-rw-r--r--fs/btrfs/inode-item.c6
-rw-r--r--fs/btrfs/inode.c597
-rw-r--r--fs/btrfs/ioctl.c217
-rw-r--r--fs/btrfs/ioctl.h4
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/ordered-data.c23
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.h2
-rw-r--r--fs/btrfs/props.c66
-rw-r--r--fs/btrfs/props.h8
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/qgroup.h3
-rw-r--r--fs/btrfs/raid-stripe-tree.h1
-rw-r--r--fs/btrfs/reflink.c100
-rw-r--r--fs/btrfs/relocation.c30
-rw-r--r--fs/btrfs/scrub.c4
-rw-r--r--fs/btrfs/send.c544
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/space-info.c2
-rw-r--r--fs/btrfs/subpage.c224
-rw-r--r--fs/btrfs/subpage.h52
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/sysfs.c14
-rw-r--r--fs/btrfs/sysfs.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c6
-rw-r--r--fs/btrfs/tests/extent-map-tests.c1
-rw-r--r--fs/btrfs/transaction.c39
-rw-r--r--fs/btrfs/tree-log.c392
-rw-r--r--fs/btrfs/verity.c4
-rw-r--r--fs/btrfs/volumes.c16
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c85
-rw-r--r--fs/btrfs/zoned.c9
-rw-r--r--fs/btrfs/zstd.c66
-rw-r--r--fs/buffer.c58
-rw-r--r--fs/cachefiles/namei.c16
-rw-r--r--fs/cachefiles/ondemand.c7
-rw-r--r--fs/ceph/addr.c1259
-rw-r--r--fs/ceph/dir.c45
-rw-r--r--fs/ceph/inode.c31
-rw-r--r--fs/ceph/mds_client.c2
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/coda/dir.c14
-rw-r--r--fs/configfs/dir.c6
-rw-r--r--fs/coredump.c42
-rw-r--r--fs/crypto/Kconfig21
-rw-r--r--fs/crypto/crypto.c22
-rw-r--r--fs/crypto/hkdf.c85
-rw-r--r--fs/crypto/inline_crypt.c4
-rw-r--r--fs/dax.c111
-rw-r--r--fs/dcache.c70
-rw-r--r--fs/devpts/inode.c251
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/drop_caches.c23
-rw-r--r--fs/ecryptfs/inode.c20
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/erofs/Kconfig14
-rw-r--r--fs/erofs/compress.h2
-rw-r--r--fs/erofs/data.c148
-rw-r--r--fs/erofs/decompressor.c95
-rw-r--r--fs/erofs/decompressor_deflate.c8
-rw-r--r--fs/erofs/decompressor_lzma.c8
-rw-r--r--fs/erofs/decompressor_zstd.c8
-rw-r--r--fs/erofs/dir.c9
-rw-r--r--fs/erofs/erofs_fs.h191
-rw-r--r--fs/erofs/fileio.c2
-rw-r--r--fs/erofs/fscache.c2
-rw-r--r--fs/erofs/inode.c125
-rw-r--r--fs/erofs/internal.h47
-rw-r--r--fs/erofs/namei.c2
-rw-r--r--fs/erofs/super.c85
-rw-r--r--fs/erofs/sysfs.c2
-rw-r--r--fs/erofs/xattr.c12
-rw-r--r--fs/erofs/zdata.c102
-rw-r--r--fs/erofs/zmap.c286
-rw-r--r--fs/eventfd.c5
-rw-r--r--fs/eventpoll.c103
-rw-r--r--fs/exec.c2
-rw-r--r--fs/exfat/namei.c8
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext2/namei.c9
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h94
-rw-r--r--fs/ext4/ext4_jbd2.c12
-rw-r--r--fs/ext4/ext4_jbd2.h113
-rw-r--r--fs/ext4/extents.c531
-rw-r--r--fs/ext4/extents_status.c1
-rw-r--r--fs/ext4/file.c27
-rw-r--r--fs/ext4/fsync.c12
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c9
-rw-r--r--fs/ext4/inline.c205
-rw-r--r--fs/ext4/inode.c292
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc-test.c2
-rw-r--r--fs/ext4/mballoc.c8
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/namei.c127
-rw-r--r--fs/ext4/orphan.c2
-rw-r--r--fs/ext4/page-io.c77
-rw-r--r--fs/ext4/resize.c4
-rw-r--r--fs/ext4/super.c267
-rw-r--r--fs/ext4/sysfs.c4
-rw-r--r--fs/ext4/xattr.c47
-rw-r--r--fs/ext4/xattr.h10
-rw-r--r--fs/f2fs/checkpoint.c71
-rw-r--r--fs/f2fs/compress.c1
-rw-r--r--fs/f2fs/data.c192
-rw-r--r--fs/f2fs/debug.c3
-rw-r--r--fs/f2fs/dir.c2
-rw-r--r--fs/f2fs/f2fs.h155
-rw-r--r--fs/f2fs/file.c126
-rw-r--r--fs/f2fs/gc.c42
-rw-r--r--fs/f2fs/inline.c22
-rw-r--r--fs/f2fs/inode.c33
-rw-r--r--fs/f2fs/namei.c22
-rw-r--r--fs/f2fs/node.c450
-rw-r--r--fs/f2fs/node.h13
-rw-r--r--fs/f2fs/segment.c55
-rw-r--r--fs/f2fs/segment.h9
-rw-r--r--fs/f2fs/shrinker.c92
-rw-r--r--fs/f2fs/super.c191
-rw-r--r--fs/f2fs/sysfs.c139
-rw-r--r--fs/f2fs/xattr.c8
-rw-r--r--fs/fat/namei_msdos.c8
-rw-r--r--fs/fat/namei_vfat.c8
-rw-r--r--fs/file.c133
-rw-r--r--fs/file_table.c73
-rw-r--r--fs/fs-writeback.c30
-rw-r--r--fs/fsopen.c2
-rw-r--r--fs/fuse/dir.c50
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/file.c8
-rw-r--r--fs/gfs2/glock.c124
-rw-r--r--fs/gfs2/incore.h4
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/lops.c78
-rw-r--r--fs/gfs2/meta_io.c15
-rw-r--r--fs/gfs2/super.c24
-rw-r--r--fs/gfs2/trace_gfs2.h10
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/hfs/dir.c10
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hostfs/hostfs_kern.c16
-rw-r--r--fs/hpfs/namei.c10
-rw-r--r--fs/hugetlbfs/inode.c6
-rw-r--r--fs/init.c7
-rw-r--r--fs/inode.c127
-rw-r--r--fs/internal.h11
-rw-r--r--fs/ioctl.c10
-rw-r--r--fs/iomap/Makefile1
-rw-r--r--fs/iomap/buffered-io.c356
-rw-r--r--fs/iomap/direct-io.c279
-rw-r--r--fs/iomap/fiemap.c21
-rw-r--r--fs/iomap/internal.h10
-rw-r--r--fs/iomap/ioend.c216
-rw-r--r--fs/iomap/iter.c97
-rw-r--r--fs/iomap/seek.c16
-rw-r--r--fs/iomap/swapfile.c7
-rw-r--r--fs/iomap/trace.h8
-rw-r--r--fs/jbd2/commit.c10
-rw-r--r--fs/jbd2/journal.c34
-rw-r--r--fs/jbd2/recovery.c80
-rw-r--r--fs/jbd2/revoke.c21
-rw-r--r--fs/jbd2/transaction.c21
-rw-r--r--fs/jffs2/dir.c18
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_dmap.c39
-rw-r--r--fs/jfs/jfs_dtree.c3
-rw-r--r--fs/jfs/jfs_extent.c10
-rw-r--r--fs/jfs/jfs_imap.c17
-rw-r--r--fs/jfs/namei.c8
-rw-r--r--fs/jfs/super.c6
-rw-r--r--fs/jfs/xattr.c15
-rw-r--r--fs/kernfs/dir.c12
-rw-r--r--fs/libfs.c4
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mnt_idmapping.c51
-rw-r--r--fs/mount.h37
-rw-r--r--fs/mpage.c49
-rw-r--r--fs/namei.c149
-rw-r--r--fs/namespace.c852
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs3proc.c29
-rw-r--r--fs/nfs/nfs4proc.c47
-rw-r--r--fs/nfs/proc.c12
-rw-r--r--fs/nfsd/nfs4recover.c7
-rw-r--r--fs/nfsd/vfs.c34
-rw-r--r--fs/nilfs2/namei.c8
-rw-r--r--fs/notify/fanotify/fanotify.c38
-rw-r--r--fs/notify/fanotify/fanotify.h18
-rw-r--r--fs/notify/fanotify/fanotify_user.c89
-rw-r--r--fs/notify/fdinfo.c5
-rw-r--r--fs/notify/fsnotify.c47
-rw-r--r--fs/notify/fsnotify.h11
-rw-r--r--fs/notify/mark.c14
-rw-r--r--fs/nsfs.c32
-rw-r--r--fs/ntfs3/namei.c8
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c10
-rw-r--r--fs/ocfs2/namei.c10
-rw-r--r--fs/omfs/dir.c6
-rw-r--r--fs/open.c44
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/orangefs/inode.c149
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-bufmap.c25
-rw-r--r--fs/orangefs/orangefs-bufmap.h3
-rw-r--r--fs/orangefs/orangefs-debug.h43
-rw-r--r--fs/orangefs/orangefs-debugfs.c43
-rw-r--r--fs/overlayfs/dir.c46
-rw-r--r--fs/overlayfs/overlayfs.h15
-rw-r--r--fs/overlayfs/params.c25
-rw-r--r--fs/overlayfs/super.c23
-rw-r--r--fs/pidfs.c247
-rw-r--r--fs/pipe.c181
-rw-r--r--fs/pnode.c14
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c55
-rw-r--r--fs/proc/kcore.c12
-rw-r--r--fs/pstore/inode.c111
-rw-r--r--fs/pstore/internal.h4
-rw-r--r--fs/pstore/platform.c11
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/file.c2
-rw-r--r--fs/smb/client/inode.c10
-rw-r--r--fs/smb/server/vfs.c58
-rw-r--r--fs/splice.c40
-rw-r--r--fs/super.c57
-rw-r--r--fs/sysv/Kconfig38
-rw-r--r--fs/sysv/Makefile9
-rw-r--r--fs/sysv/balloc.c240
-rw-r--r--fs/sysv/dir.c378
-rw-r--r--fs/sysv/file.c59
-rw-r--r--fs/sysv/ialloc.c235
-rw-r--r--fs/sysv/inode.c354
-rw-r--r--fs/sysv/itree.c511
-rw-r--r--fs/sysv/namei.c280
-rw-r--r--fs/sysv/super.c595
-rw-r--r--fs/sysv/sysv.h245
-rw-r--r--fs/timerfd.c11
-rw-r--r--fs/tracefs/inode.c10
-rw-r--r--fs/ubifs/dir.c10
-rw-r--r--fs/ubifs/io.c3
-rw-r--r--fs/udf/namei.c12
-rw-r--r--fs/ufs/namei.c8
-rw-r--r--fs/unicode/Kconfig5
-rw-r--r--fs/unicode/Makefile2
-rw-r--r--fs/unicode/tests/.kunitconfig3
-rw-r--r--fs/unicode/tests/utf8_kunit.c (renamed from fs/unicode/utf8-selftest.c)153
-rw-r--r--fs/unicode/utf8-norm.c2
-rw-r--r--fs/vboxsf/dir.c8
-rw-r--r--fs/verity/Kconfig8
-rw-r--r--fs/xfs/Makefile7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c316
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_format.h20
-rw-r--r--fs/xfs/libxfs/xfs_fs.h14
-rw-r--r--fs/xfs/libxfs/xfs_group.h31
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c23
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c1
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c167
-rw-r--r--fs/xfs/libxfs/xfs_metafile.h6
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h6
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c11
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.c39
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h50
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c19
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c81
-rw-r--r--fs/xfs/libxfs/xfs_types.h28
-rw-r--r--fs/xfs/libxfs/xfs_zones.c186
-rw-r--r--fs/xfs/libxfs/xfs_zones.h35
-rw-r--r--fs/xfs/scrub/agheader.c2
-rw-r--r--fs/xfs/scrub/bmap.c4
-rw-r--r--fs/xfs/scrub/fscounters.c22
-rw-r--r--fs/xfs/scrub/fscounters_repair.c12
-rw-r--r--fs/xfs/scrub/inode.c7
-rw-r--r--fs/xfs/scrub/inode_repair.c7
-rw-r--r--fs/xfs/scrub/newbt.c2
-rw-r--r--fs/xfs/scrub/orphanage.c9
-rw-r--r--fs/xfs/scrub/reap.c9
-rw-r--r--fs/xfs/scrub/repair.c37
-rw-r--r--fs/xfs/scrub/rtbitmap.c11
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c34
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c29
-rw-r--r--fs/xfs/scrub/scrub.c2
-rw-r--r--fs/xfs/xfs_aops.c194
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c32
-rw-r--r--fs/xfs/xfs_bmap_util.h12
-rw-r--r--fs/xfs/xfs_buf.c558
-rw-r--r--fs/xfs/xfs_buf.h29
-rw-r--r--fs/xfs/xfs_buf_item.c114
-rw-r--r--fs/xfs/xfs_buf_item_recover.c8
-rw-r--r--fs/xfs/xfs_buf_mem.c43
-rw-r--r--fs/xfs/xfs_buf_mem.h6
-rw-r--r--fs/xfs/xfs_discard.c3
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c35
-rw-r--r--fs/xfs/xfs_file.c351
-rw-r--r--fs/xfs/xfs_fsmap.c86
-rw-r--r--fs/xfs/xfs_fsops.c67
-rw-r--r--fs/xfs/xfs_fsops.h3
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_inode.h28
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_inode_item_recover.c1
-rw-r--r--fs/xfs/xfs_ioctl.c12
-rw-r--r--fs/xfs/xfs_iomap.c532
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c35
-rw-r--r--fs/xfs/xfs_log.c4
-rw-r--r--fs/xfs/xfs_message.c4
-rw-r--r--fs/xfs/xfs_message.h1
-rw-r--r--fs/xfs/xfs_mount.c212
-rw-r--r--fs/xfs/xfs_mount.h131
-rw-r--r--fs/xfs/xfs_qm.c3
-rw-r--r--fs/xfs/xfs_reflink.c18
-rw-r--r--fs/xfs/xfs_rtalloc.c244
-rw-r--r--fs/xfs/xfs_rtalloc.h5
-rw-r--r--fs/xfs/xfs_super.c168
-rw-r--r--fs/xfs/xfs_sysfs.c75
-rw-r--r--fs/xfs/xfs_sysfs.h5
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h218
-rw-r--r--fs/xfs/xfs_zone_alloc.c1220
-rw-r--r--fs/xfs/xfs_zone_alloc.h70
-rw-r--r--fs/xfs/xfs_zone_gc.c1165
-rw-r--r--fs/xfs/xfs_zone_info.c105
-rw-r--r--fs/xfs/xfs_zone_priv.h119
-rw-r--r--fs/xfs/xfs_zone_space_resv.c263
-rw-r--r--fs/zonefs/file.c2
529 files changed, 20722 insertions, 15551 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3e68521f4e2f..399d455d50d6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir,
*
*/
-static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int err;
u32 perm;
@@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (fid)
p9_fid_put(fid);
-
- return err;
+ return ERR_PTR(err);
}
/**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 143ac03b7425..cc2007be2173 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -350,9 +350,9 @@ out:
*
*/
-static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t omode)
+static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t omode)
{
int err;
struct v9fs_session_info *v9ses;
@@ -417,7 +417,7 @@ error:
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
p9_fid_put(dfid);
- return err;
+ return ERR_PTR(err);
}
static int
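The two 9p hunks above follow the VFS-wide change threaded through this diff: the ->mkdir() inode operation now returns a struct dentry * rather than an int. A filesystem returns NULL when the caller-supplied dentry was used, a different dentry if one was spliced in, or ERR_PTR(err) on failure (ERR_PTR(0) is NULL, which is why v9fs can simply return ERR_PTR(err) on its exit path). A minimal sketch of the new convention, using hypothetical foofs_* helpers that are not part of this patch:

static struct dentry *foofs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				  struct dentry *dentry, umode_t mode)
{
	struct inode *inode;
	int err;

	inode = foofs_new_inode(dir, S_IFDIR | mode);	/* hypothetical helper */
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	err = foofs_add_entry(dir, dentry, inode);	/* hypothetical helper */
	if (err) {
		iput(inode);
		return ERR_PTR(err);
	}

	d_instantiate(dentry, inode);
	return NULL;	/* success: the caller-supplied dentry was used */
}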
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d420e3c475..afe21866d6b4 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -336,7 +336,6 @@ source "fs/qnx4/Kconfig"
source "fs/qnx6/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
-source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/erofs/Kconfig"
source "fs/vboxsf/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 15df0a923d3a..77fd7f7b5d02 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-y += unicode/
-obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS) += smb/
obj-$(CONFIG_HPFS_FS) += hpfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e8c2c4535cb3..ac4e9a02910b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
extern int affs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool);
-extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 8c154490a2d6..f883be50db12 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-int
+struct dentry *
affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
@@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = affs_new_inode(dir);
if (!inode)
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
inode->i_mode = S_IFDIR | mode;
affs_mode_to_prot(inode);
@@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
- return error;
+ return ERR_PTR(error);
}
- return 0;
+ return NULL;
}
int
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 6d42f85c6be5..e941da5b6dd9 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -362,3 +362,53 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
alist->nr_addrs++;
return 0;
}
+
+/*
+ * Set the app data on the rxrpc peers an address list points to
+ */
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist)
+{
+ unsigned long data = (unsigned long)server;
+ int n = 0, o = 0;
+
+ if (!old_alist) {
+ /* New server. Just set all. */
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ return;
+ }
+ if (!new_alist) {
+ /* Dead server. Just remove all. */
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+ return;
+ }
+
+ /* Walk through the two lists simultaneously, setting new peers and
+ * clearing old ones. The two lists are ordered by pointer to peer
+ * record.
+ */
+ while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) {
+ struct rxrpc_peer *pn = new_alist->addrs[n].peer;
+ struct rxrpc_peer *po = old_alist->addrs[o].peer;
+
+ if (pn == po)
+ continue;
+ if (pn < po) {
+ rxrpc_kernel_set_peer_data(pn, data);
+ n++;
+ } else {
+ rxrpc_kernel_set_peer_data(po, 0);
+ o++;
+ }
+ }
+
+ if (n < new_alist->nr_addrs)
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ if (o < old_alist->nr_addrs)
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+}
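afs_set_peer_appdata() above relies on both address lists being ordered by peer pointer, so a single merge-style walk can tag addresses that appear only in the new list and untag those that appear only in the old one, instead of comparing every pair. A reduced sketch of that walk over two ascending integer arrays, with hypothetical add()/del() callbacks standing in for the rxrpc calls:

static void reconcile(const int *new, int n_new, const int *old, int n_old)
{
	int n = 0, o = 0;

	while (n < n_new && o < n_old) {
		if (new[n] == old[o]) {
			n++;		/* present in both lists: leave as-is */
			o++;
		} else if (new[n] < old[o]) {
			add(new[n++]);	/* only in the new list */
		} else {
			del(old[o++]);	/* only in the old list */
		}
	}
	while (n < n_new)
		add(new[n++]);
	while (o < n_old)
		del(old[o++]);
}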
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 96a6781f3653..0168bbf53fe0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
static atomic_t cell_debug_id;
-static void afs_queue_cell_manager(struct afs_net *);
-static void afs_manage_cell_work(struct work_struct *);
+static void afs_cell_timer(struct timer_list *timer);
+static void afs_destroy_cell_work(struct work_struct *work);
+static void afs_manage_cell_work(struct work_struct *work);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
@@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net)
wake_up_var(&net->cells_outstanding);
}
-/*
- * Set the cell timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
+static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state)
{
- if (net->live) {
- atomic_inc(&net->cells_outstanding);
- if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
- afs_dec_cells_outstanding(net);
- } else {
- afs_queue_cell_manager(net);
- }
+ smp_store_release(&cell->state, state); /* Commit cell changes before state */
+ smp_wmb(); /* Set cell state before task state */
+ wake_up_var(&cell->state);
}
/*
@@ -116,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
const char *name, unsigned int namelen,
const char *addresses)
{
- struct afs_vlserver_list *vllist;
+ struct afs_vlserver_list *vllist = NULL;
struct afs_cell *cell;
int i, ret;
@@ -163,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->net = net;
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
+ INIT_WORK(&cell->destroyer, afs_destroy_cell_work);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ timer_setup(&cell->management_timer, afs_cell_timer, 0);
init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
- seqlock_init(&cell->fs_lock);
+ init_rwsem(&cell->fs_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -204,7 +199,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->dns_status = vllist->status;
smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
atomic_inc(&net->cells_outstanding);
+ ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell,
+ 2, INT_MAX / 2, GFP_KERNEL);
+ if (ret < 0)
+ goto error;
+ cell->dynroot_ino = ret;
cell->debug_id = atomic_inc_return(&cell_debug_id);
+
trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
_leave(" = %p", cell);
@@ -214,6 +215,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ afs_put_vlserverlist(cell->net, vllist);
kfree(cell->name - 1);
kfree(cell);
_leave(" = %d", ret);
@@ -227,6 +229,7 @@ error:
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
* @excl: T if an error should be given if the cell name already exists.
+ * @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
* needed. Note that that actual DNS query is punted off to the manager thread
@@ -235,7 +238,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl)
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
struct rb_node *parent, **pp;
@@ -245,7 +249,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
_enter("%s,%s", name, vllist);
if (!excl) {
- cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
+ cell = afs_find_cell(net, name, namesz, trace);
if (!IS_ERR(cell))
goto wait_for_cell;
}
@@ -288,26 +292,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
- atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
+ afs_use_cell(cell, trace);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_get_queue_new);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
- afs_cell_trace_wait);
_debug("wait_for_cell");
- wait_var_event(&cell->state,
- ({
- state = smp_load_acquire(&cell->state); /* vs error */
- state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
- }));
+ state = smp_load_acquire(&cell->state); /* vs error */
+ if (state != AFS_CELL_ACTIVE &&
+ state != AFS_CELL_DEAD) {
+ afs_see_cell(cell, afs_cell_trace_wait);
+ wait_var_event(&cell->state,
+ ({
+ state = smp_load_acquire(&cell->state); /* vs error */
+ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
+ }));
+ }
/* Check the state obtained from the wait check. */
- if (state == AFS_CELL_REMOVED) {
+ if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
@@ -321,7 +327,7 @@ cell_already_exists:
if (excl) {
ret = -EEXIST;
} else {
- afs_use_cell(cursor, afs_cell_trace_use_lookup);
+ afs_use_cell(cursor, trace);
ret = 0;
}
up_write(&net->cells_lock);
@@ -331,7 +337,7 @@ cell_already_exists:
goto wait_for_cell;
goto error_noput;
error:
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error);
error_noput:
_leave(" = %d [error]", ret);
return ERR_PTR(ret);
@@ -376,8 +382,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
if (cp && cp < rootcell + len)
return -EINVAL;
- /* allocate a cell record for the root cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
+ /* allocate a cell record for the root/workstation cell */
+ new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
return PTR_ERR(new_root);
@@ -388,12 +395,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
/* install the new cell */
down_write(&net->cells_lock);
- afs_see_cell(new_root, afs_cell_trace_see_ws);
old_root = rcu_replace_pointer(net->ws_cell, new_root,
lockdep_is_held(&net->cells_lock));
up_write(&net->cells_lock);
- afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
+ afs_unuse_cell(old_root, afs_cell_trace_unuse_ws);
_leave(" = 0");
return 0;
}
@@ -511,8 +517,9 @@ static void afs_cell_destroy(struct rcu_head *rcu)
trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
- afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
+ afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias);
key_put(cell->anonymous_key);
+ idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);
kfree(cell->name - 1);
kfree(cell);
@@ -520,30 +527,14 @@ static void afs_cell_destroy(struct rcu_head *rcu)
_leave(" [destroyed]");
}
-/*
- * Queue the cell manager.
- */
-static void afs_queue_cell_manager(struct afs_net *net)
-{
- int outstanding = atomic_inc_return(&net->cells_outstanding);
-
- _enter("%d", outstanding);
-
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
-}
-
-/*
- * Cell management timer. We have an increment on cells_outstanding that we
- * need to pass along to the work item.
- */
-void afs_cells_timer(struct timer_list *timer)
+static void afs_destroy_cell_work(struct work_struct *work)
{
- struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
+ struct afs_cell *cell = container_of(work, struct afs_cell, destroyer);
- _enter("");
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
+ afs_see_cell(cell, afs_cell_trace_destroy);
+ timer_delete_sync(&cell->management_timer);
+ cancel_work_sync(&cell->manager);
+ call_rcu(&cell->rcu, afs_cell_destroy);
}
/*
@@ -575,7 +566,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
- call_rcu(&cell->rcu, afs_cell_destroy);
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
}
}
@@ -587,10 +578,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
int r, a;
- r = refcount_read(&cell->ref);
- WARN_ON(r == 0);
+ __refcount_inc(&cell->ref, &r);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, r, a, reason);
+ trace_afs_cell(cell->debug_id, r + 1, a, reason);
return cell;
}
@@ -598,10 +588,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
* Record a cell becoming less active. When the active counter reaches 1, it
* is scheduled for destruction, but may get reactivated.
*/
-void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
unsigned int debug_id;
time64_t now, expire_delay;
+ bool zero;
int r, a;
if (!cell)
@@ -616,13 +607,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, r, a, reason);
- WARN_ON(a == 0);
- if (a == 1)
+ if (!a)
/* 'cell' may now be garbage collected. */
- afs_set_cell_timer(net, expire_delay);
+ afs_set_cell_timer(cell, expire_delay);
+
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero)
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
/*
@@ -642,9 +635,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- afs_get_cell(cell, reason);
- if (!queue_work(afs_wq, &cell->manager))
- afs_put_cell(cell, afs_cell_trace_put_queue_fail);
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Cell-specific management timer.
+ */
+static void afs_cell_timer(struct timer_list *timer)
+{
+ struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer);
+
+ afs_see_cell(cell, afs_cell_trace_see_mgmt_timer);
+ if (refcount_read(&cell->ref) > 0 && cell->net->live)
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Set/reduce the cell timer.
+ */
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
+{
+ timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ);
}
/*
@@ -706,7 +717,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
if (cell->proc_link.next)
cell->proc_link.next->pprev = &cell->proc_link.next;
- afs_dynroot_mkdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
return 0;
}
@@ -723,217 +733,130 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
mutex_lock(&net->proc_cells_lock);
if (!hlist_unhashed(&cell->proc_link))
hlist_del_rcu(&cell->proc_link);
- afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
_leave("");
}
+static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage)
+{
+ const struct afs_vlserver_list *vllist;
+ time64_t expire_at = cell->last_inactive;
+ time64_t now = ktime_get_real_seconds();
+
+ if (atomic_read(&cell->active))
+ return false;
+ if (!cell->net->live)
+ return true;
+
+ vllist = rcu_dereference_protected(cell->vl_servers, true);
+ if (vllist && vllist->nr_servers > 0)
+ expire_at += afs_cell_gc_delay;
+
+ if (expire_at <= now)
+ return true;
+ if (expire_at < *_next_manage)
+ *_next_manage = expire_at;
+ return false;
+}
+
/*
* Manage a cell record, initialising and destroying it, maintaining its DNS
* records.
*/
-static void afs_manage_cell(struct afs_cell *cell)
+static bool afs_manage_cell(struct afs_cell *cell)
{
struct afs_net *net = cell->net;
- int ret, active;
+ time64_t next_manage = TIME64_MAX;
+ int ret;
_enter("%s", cell->name);
-again:
_debug("state %u", cell->state);
switch (cell->state) {
- case AFS_CELL_INACTIVE:
- case AFS_CELL_FAILED:
- down_write(&net->cells_lock);
- active = 1;
- if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
- rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
- afs_cell_trace_unuse_delete);
- smp_store_release(&cell->state, AFS_CELL_REMOVED);
- }
- up_write(&net->cells_lock);
- if (cell->state == AFS_CELL_REMOVED) {
- wake_up_var(&cell->state);
- goto final_destruction;
- }
- if (cell->state == AFS_CELL_FAILED)
- goto done;
- smp_store_release(&cell->state, AFS_CELL_UNSET);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_UNSET:
- smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_ACTIVATING:
- ret = afs_activate_cell(net, cell);
- if (ret < 0)
- goto activation_failed;
+ case AFS_CELL_SETTING_UP:
+ goto set_up_cell;
+ case AFS_CELL_ACTIVE:
+ goto cell_is_active;
+ case AFS_CELL_REMOVING:
+ WARN_ON_ONCE(1);
+ return false;
+ case AFS_CELL_DEAD:
+ return false;
+ default:
+ _debug("bad state %u", cell->state);
+ WARN_ON_ONCE(1); /* Unhandled state */
+ return false;
+ }
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- goto again;
+set_up_cell:
+ ret = afs_activate_cell(net, cell);
+ if (ret < 0) {
+ cell->error = ret;
+ goto remove_cell;
+ }
- case AFS_CELL_ACTIVE:
- if (atomic_read(&cell->active) > 1) {
- if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
- ret = afs_update_cell(cell);
- if (ret < 0)
- cell->error = ret;
- }
- goto done;
- }
- smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
- wake_up_var(&cell->state);
- goto again;
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
- case AFS_CELL_DEACTIVATING:
- if (atomic_read(&cell->active) > 1)
- goto reverse_deactivation;
- afs_deactivate_cell(net, cell);
- smp_store_release(&cell->state, AFS_CELL_INACTIVE);
- wake_up_var(&cell->state);
- goto again;
+cell_is_active:
+ if (afs_has_cell_expired(cell, &next_manage))
+ goto remove_cell;
- case AFS_CELL_REMOVED:
- goto done;
+ if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+ ret = afs_update_cell(cell);
+ if (ret < 0)
+ cell->error = ret;
+ }
- default:
- break;
+ if (next_manage < TIME64_MAX && cell->net->live) {
+ time64_t now = ktime_get_real_seconds();
+
+ if (next_manage - now <= 0)
+ afs_queue_cell(cell, afs_cell_trace_queue_again);
+ else
+ afs_set_cell_timer(cell, next_manage - now);
}
- _debug("bad state %u", cell->state);
- BUG(); /* Unhandled state */
+ _leave(" [done %u]", cell->state);
+ return false;
-activation_failed:
- cell->error = ret;
- afs_deactivate_cell(net, cell);
+remove_cell:
+ down_write(&net->cells_lock);
- smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
- wake_up_var(&cell->state);
- goto again;
+ if (atomic_read(&cell->active)) {
+ up_write(&net->cells_lock);
+ goto cell_is_active;
+ }
-reverse_deactivation:
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- _leave(" [deact->act]");
- return;
+ /* Make sure that the expiring server records are going to see the fact
+ * that the cell is caput.
+ */
+ afs_set_cell_state(cell, AFS_CELL_REMOVING);
-done:
- _leave(" [done %u]", cell->state);
- return;
+ afs_deactivate_cell(net, cell);
+ afs_purge_servers(cell);
+
+ rb_erase(&cell->net_node, &net->cells);
+ afs_see_cell(cell, afs_cell_trace_unuse_delete);
+ up_write(&net->cells_lock);
-final_destruction:
/* The root volume is pinning the cell */
afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
- afs_put_cell(cell, afs_cell_trace_put_destroy);
+
+ afs_set_cell_state(cell, AFS_CELL_DEAD);
+ return true;
}
static void afs_manage_cell_work(struct work_struct *work)
{
struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+ bool final_put;
- afs_manage_cell(cell);
- afs_put_cell(cell, afs_cell_trace_put_queue_work);
-}
-
-/*
- * Manage the records of cells known to a network namespace. This includes
- * updating the DNS records and garbage collecting unused cells that were
- * automatically added.
- *
- * Note that constructed cell records may only be removed from net->cells by
- * this work item, so it is safe for this work item to stash a cursor pointing
- * into the tree and then return to caller (provided it skips cells that are
- * still under construction).
- *
- * Note also that we were given an increment on net->cells_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_cells(struct work_struct *work)
-{
- struct afs_net *net = container_of(work, struct afs_net, cells_manager);
- struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
-
- _enter("");
-
- /* Trawl the cell database looking for cells that have expired from
- * lack of use and cells whose DNS results have expired and dispatch
- * their managers.
- */
- down_read(&net->cells_lock);
-
- for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
- struct afs_cell *cell =
- rb_entry(cursor, struct afs_cell, net_node);
- unsigned active;
- bool sched_cell = false;
-
- active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_manage);
-
- ASSERTCMP(active, >=, 1);
-
- if (purging) {
- if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
- active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_unuse_pin);
- }
- }
-
- if (active == 1) {
- struct afs_vlserver_list *vllist;
- time64_t expire_at = cell->last_inactive;
-
- read_lock(&cell->vl_servers_lock);
- vllist = rcu_dereference_protected(
- cell->vl_servers,
- lockdep_is_held(&cell->vl_servers_lock));
- if (vllist->nr_servers > 0)
- expire_at += afs_cell_gc_delay;
- read_unlock(&cell->vl_servers_lock);
- if (purging || expire_at <= now)
- sched_cell = true;
- else if (expire_at < next_manage)
- next_manage = expire_at;
- }
-
- if (!purging) {
- if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
- sched_cell = true;
- }
-
- if (sched_cell)
- afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
- }
-
- up_read(&net->cells_lock);
-
- /* Update the timer on the way out. We have to pass an increment on
- * cells_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
-
- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->cells_manager))
- atomic_inc(&net->cells_outstanding);
- } else {
- afs_set_cell_timer(net, next_manage - now);
- }
- }
-
- afs_dec_cells_outstanding(net);
- _leave(" [%d]", atomic_read(&net->cells_outstanding));
+ afs_see_cell(cell, afs_cell_trace_manage);
+ final_put = afs_manage_cell(cell);
+ afs_see_cell(cell, afs_cell_trace_managed);
+ if (final_put)
+ afs_put_cell(cell, afs_cell_trace_put_final);
}
/*
@@ -942,6 +865,7 @@ void afs_manage_cells(struct work_struct *work)
void afs_cell_purge(struct afs_net *net)
{
struct afs_cell *ws;
+ struct rb_node *cursor;
_enter("");
@@ -949,14 +873,21 @@ void afs_cell_purge(struct afs_net *net)
ws = rcu_replace_pointer(net->ws_cell, NULL,
lockdep_is_held(&net->cells_lock));
up_write(&net->cells_lock);
- afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
+ afs_unuse_cell(ws, afs_cell_trace_unuse_ws);
- _debug("del timer");
- if (del_timer_sync(&net->cells_timer))
- atomic_dec(&net->cells_outstanding);
+ _debug("kick cells");
+ down_read(&net->cells_lock);
+ for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
+ struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node);
+
+ afs_see_cell(cell, afs_cell_trace_purge);
- _debug("kick mgr");
- afs_queue_cell_manager(net);
+ if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+ afs_unuse_cell(cell, afs_cell_trace_unuse_pin);
+
+ afs_queue_cell(cell, afs_cell_trace_queue_purge);
+ }
+ up_read(&net->cells_lock);
_debug("wait");
wait_var_event(&net->cells_outstanding,
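The cell.c rework above drops the per-namespace cell manager and its global timer in favour of a per-cell management timer and work item: the timer callback only requeues that cell's own manager work, and timer_reduce() lets afs_set_cell_timer() pull an already-armed timer earlier without ever pushing it later. A minimal sketch of that pattern on a hypothetical struct obj (not this patch's types):

struct obj {
	struct timer_list timer;
	struct work_struct work;
};

static void obj_timer_fired(struct timer_list *t)
{
	struct obj *obj = container_of(t, struct obj, timer);

	/* Timers run in softirq context; defer the real work to a workqueue. */
	queue_work(system_wq, &obj->work);
}

static void obj_init(struct obj *obj, work_func_t fn)
{
	timer_setup(&obj->timer, obj_timer_fired, 0);
	INIT_WORK(&obj->work, fn);
}

/* Arm the timer for delay_secs from now; an earlier pending expiry is kept. */
static void obj_set_timer(struct obj *obj, unsigned int delay_secs)
{
	timer_reduce(&obj->timer, jiffies + delay_secs * HZ);
}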
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 99a3f20bc786..1a906805a9e3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call)
}
/*
- * Find the server record by peer address and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_peer(struct afs_call *call)
-{
- struct sockaddr_rxrpc srx;
- struct afs_server *server;
- struct rxrpc_peer *peer;
-
- peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
-
- server = afs_find_server(call->net, peer);
- if (!server) {
- trace_afs_cm_no_server(call, &srx);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
- * Find the server record by server UUID and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_uuid(struct afs_call *call,
- struct afs_uuid *uuid)
-{
- struct afs_server *server;
-
- rcu_read_lock();
- server = afs_find_server_by_uuid(call->net, call->request);
- rcu_read_unlock();
- if (!server) {
- trace_afs_cm_no_server_u(call, call->request);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
* Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
@@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
*/
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
- int ret;
-
_enter("");
afs_extract_discard(call, 0);
- ret = afs_extract_data(call, false);
- if (ret < 0)
- return ret;
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return afs_extract_data(call, false);
}
/*
@@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
__be32 *b;
int ret;
- _enter("");
-
_enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
@@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_uuid(call, call->request);
+ if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) {
+ pr_notice("Callback UUID does not match fileserver UUID\n");
+ trace_afs_cm_no_server_u(call, call->request);
+ return 0;
+ }
+
+ return 0;
}
/*
@@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* We'll need the file server record as that tells us which set of
- * vnodes to operate upon.
- */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 02cbf38e1a77..9e7b1fe82c27 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode);
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
static int afs_link(struct dentry *from, struct inode *dir,
@@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
afs_stat_v(dvnode, n_lookup);
inode = afs_do_lookup(dir, dentry);
if (inode == ERR_PTR(-ENOENT))
- inode = afs_try_auto_mntpt(dentry, dir);
-
- if (!IS_ERR_OR_NULL(inode))
+ inode = NULL;
+ else if (!IS_ERR_OR_NULL(inode))
fid = AFS_FS_I(inode)->fid;
_debug("splice %p", dentry->d_inode);
@@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
/*
* create a directory on an AFS filesystem
*/
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct afs_operation *op;
struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op = afs_alloc_operation(NULL, dvnode->volume);
if (IS_ERR(op)) {
d_drop(dentry);
- return PTR_ERR(op);
+ return ERR_CAST(op);
}
fscache_use_cookie(afs_vnode_cache(dvnode), true);
@@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op->ops = &afs_mkdir_operation;
ret = afs_do_sync_operation(op);
afs_dir_unuse_cookie(dvnode, ret);
- return ret;
+ return ERR_PTR(ret);
}
/*
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 7d997f7a8028..691e0ae607a1 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,16 +10,19 @@
#include <linux/dns_resolver.h>
#include "internal.h"
-static atomic_t afs_autocell_ino;
+#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */
+#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX)
+
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino);
/*
* iget5() comparator for inode created by autocell operations
- *
- * These pseudo inodes don't match anything.
*/
static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
{
- return 0;
+ struct afs_fid *fid = opaque;
+
+ return inode->i_ino == fid->vnode;
}
/*
@@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
}
/*
- * Create an inode for a dynamic root directory or an autocell dynamic
- * automount dir.
+ * Create an inode for an autocell dynamic automount dir.
*/
-struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
+static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
{
- struct afs_super_info *as = AFS_FS_S(sb);
struct afs_vnode *vnode;
struct inode *inode;
- struct afs_fid fid = {};
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
_enter("");
- if (as->volume)
- fid.vid = as->volume->vid;
- if (root) {
- fid.vnode = 1;
- fid.unique = 1;
- } else {
- fid.vnode = atomic_inc_return(&afs_autocell_ino);
- fid.unique = 0;
- }
-
inode = iget5_locked(sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
if (!inode) {
@@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- BUG_ON(!(inode->i_state & I_NEW));
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- inode->i_size = 0;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- if (root) {
- inode->i_op = &afs_dynroot_inode_operations;
- inode->i_fop = &simple_dir_operations;
- } else {
- inode->i_op = &afs_autocell_inode_operations;
- }
- set_nlink(inode, 2);
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- inode->i_generation = 0;
-
- set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
- if (!root) {
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_autocell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
- inode->i_flags |= S_AUTOMOUNT;
- }
- inode->i_flags |= S_NOATIME;
- unlock_new_inode(inode);
+ unlock_new_inode(inode);
+ }
_leave(" = %p", inode);
return inode;
}
/*
- * Probe to see if a cell may exist. This prevents positive dentries from
- * being created unnecessarily.
+ * Try to automount the mountpoint with pseudo directory, if the autocell
+ * option is set.
*/
-static int afs_probe_cell_name(struct dentry *dentry)
+static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct afs_cell *cell;
+ struct afs_cell *cell = NULL;
struct afs_net *net = afs_d2net(dentry);
+ struct inode *inode = NULL;
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
- char *result = NULL;
- int ret;
+ bool dotted = false;
+ int ret = -ENOENT;
/* Names prefixed with a dot are R/W mounts. */
if (name[0] == '.') {
- if (len == 1)
- return -EINVAL;
name++;
len--;
+ dotted = true;
}
- cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
- if (!IS_ERR(cell)) {
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
- return 0;
- }
-
- ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- &result, NULL, false);
- if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
- ret = -ENOENT;
- if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
- struct dns_server_list_v1_header *v1 = (void *)result;
-
- if (v1->hdr.zero == 0 &&
- v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
- v1->hdr.version == 1 &&
- (v1->status != DNS_LOOKUP_GOOD &&
- v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
- return -ENOENT;
-
+ cell = afs_lookup_cell(net, name, len, NULL, false,
+ afs_cell_trace_use_lookup_dynroot);
+ if (IS_ERR(cell)) {
+ ret = PTR_ERR(cell);
+ goto out_no_cell;
}
- kfree(result);
- return ret;
-}
-
-/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
- */
-struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
-{
- struct afs_vnode *vnode = AFS_FS_I(dir);
- struct inode *inode;
- int ret = -ENOENT;
-
- _enter("%p{%pd}, {%llx:%llu}",
- dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
-
- if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
- goto out;
-
- ret = afs_probe_cell_name(dentry);
- if (ret < 0)
- goto out;
-
- inode = afs_iget_pseudo_dir(dir->i_sb, false);
+ inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto out;
}
- _leave("= %p", inode);
- return inode;
+ dentry->d_fsdata = cell;
+ return d_splice_alias(inode, dentry);
out:
- _leave("= %d", ret);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot);
+out_no_cell:
+ if (!inode)
+ return d_splice_alias(inode, dentry);
return ret == -ENOENT ? NULL : ERR_PTR(ret);
}
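afs_dynroot_lookup_cell() above derives the pseudo-directory inode number from the cell's dynroot_ino, which cell.c now reserves with idr_alloc_cyclic() starting at 2: each cell owns a pair of inode numbers, with the odd one used for the dotted (read/write) alias, while numbers below AFS_MIN_DYNROOT_CELL_INO stay reserved for '.', '..', '@cell' and '.@cell'. A one-line sketch of that mapping (the helper name is illustrative only):

static ino_t afs_cell_dynroot_ino(unsigned int dynroot_ino, bool dotted)
{
	/* dynroot_ino starts at 2, so the smallest result is 4 (AFS_MIN_DYNROOT_CELL_INO). */
	return (ino_t)dynroot_ino * 2 + (dotted ? 1 : 0);
}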
@@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
{
_enter("%pd", dentry);
- ASSERTCMP(d_inode(dentry), ==, NULL);
-
if (flags & LOOKUP_CREATE)
return ERR_PTR(-EOPNOTSUPP);
@@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
return ERR_PTR(-ENAMETOOLONG);
}
- return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
+ if (dentry->d_name.len == 5 &&
+ memcmp(dentry->d_name.name, "@cell", 5) == 0)
+ return afs_lookup_atcell(dir, dentry, 2);
+
+ if (dentry->d_name.len == 6 &&
+ memcmp(dentry->d_name.name, ".@cell", 6) == 0)
+ return afs_lookup_atcell(dir, dentry, 3);
+
+ return afs_dynroot_lookup_cell(dir, dentry, flags);
}
const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_delete = always_delete_dentry,
- .d_release = afs_d_release,
- .d_automount = afs_d_automount,
-};
-
-/*
- * Create a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
- */
-int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
-{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *subdir, *dsubdir;
- char *dotname = cell->name - 1;
- int ret;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return 0;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- subdir = lookup_one_len(cell->name, root, cell->name_len);
- if (IS_ERR(subdir)) {
- ret = PTR_ERR(subdir);
- goto unlock;
- }
-
- dsubdir = lookup_one_len(dotname, root, cell->name_len + 1);
- if (IS_ERR(dsubdir)) {
- ret = PTR_ERR(dsubdir);
- dput(subdir);
- goto unlock;
- }
-
- /* Note that we're retaining extra refs on the dentries. */
- subdir->d_fsdata = (void *)1UL;
- dsubdir->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
-}
-
-static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len)
+static void afs_dynroot_d_release(struct dentry *dentry)
{
- struct dentry *subdir;
-
- /* Don't want to trigger a lookup call, which will re-add the cell */
- subdir = try_lookup_one_len(name, root, name_len);
- if (IS_ERR_OR_NULL(subdir)) {
- _debug("lookup %ld", PTR_ERR(subdir));
- return;
- }
+ struct afs_cell *cell = dentry->d_fsdata;
- _debug("rmdir %pd %u", subdir, d_count(subdir));
-
- if (subdir->d_fsdata) {
- _debug("unpin %u", d_count(subdir));
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- dput(subdir);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt);
}
/*
- * Remove a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Keep @cell symlink dentries around, but only keep cell autodirs when they're
+ * being used.
*/
-void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
+static int afs_dynroot_delete_dentry(const struct dentry *dentry)
{
- struct super_block *sb = net->dynroot_sb;
- char *dotname = cell->name - 1;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return;
+ const struct qstr *name = &dentry->d_name;
- inode_lock(sb->s_root->d_inode);
- afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len);
- afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1);
- inode_unlock(sb->s_root->d_inode);
- _leave("");
+ if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0)
+ return 0;
+ if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0)
+ return 0;
+ return 1;
}
+const struct dentry_operations afs_dynroot_dentry_operations = {
+ .d_delete = afs_dynroot_delete_dentry,
+ .d_release = afs_dynroot_d_release,
+ .d_automount = afs_d_automount,
+};
+
static void afs_atcell_delayed_put_cell(void *arg)
{
struct afs_cell *cell = arg;
@@ -347,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = {
};
/*
- * Look up @cell or .@cell in a dynroot directory. This is a substitution for
- * the local cell name for the net namespace.
+ * Create an inode for the @cell or .@cell symlinks.
*/
-static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name)
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino)
{
struct afs_vnode *vnode;
- struct afs_fid fid = { .vnode = 2, .unique = 1, };
- struct dentry *dentry;
struct inode *inode;
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
- if (name[0] == '.')
- fid.vnode = 3;
-
- dentry = d_alloc_name(root, name);
- if (!dentry)
- return ERR_PTR(-ENOMEM);
-
- inode = iget5_locked(dentry->d_sb, fid.vnode,
+ inode = iget5_locked(dir->i_sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
- if (!inode) {
- dput(dentry);
+ if (!inode)
return ERR_PTR(-ENOMEM);
- }
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) {
- iput(inode);
- dput(dentry);
- return ERR_PTR(-EIO);
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 1);
+ inode->i_size = 0;
+ inode->i_mode = S_IFLNK | 0555;
+ inode->i_op = &afs_atcell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ unlock_new_inode(inode);
}
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- simple_inode_init_ts(inode);
- set_nlink(inode, 1);
- inode->i_size = 0;
- inode->i_mode = S_IFLNK | 0555;
- inode->i_op = &afs_atcell_inode_operations;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_blocks = 0;
- inode->i_generation = 0;
- inode->i_flags |= S_NOATIME;
-
- unlock_new_inode(inode);
- d_splice_alias(inode, dentry);
- return dentry;
+ return d_splice_alias(inode, dentry);
}
/*
- * Create @cell and .@cell symlinks.
+ * Transcribe the cell database into readdir content under the RCU read lock.
+ * Each cell produces two entries, one prefixed with a dot and one not.
*/
-static int afs_dynroot_symlink(struct afs_net *net)
+static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx)
{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *symlink, *dsymlink;
- int ret;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- symlink = afs_dynroot_create_symlink(root, "@cell");
- if (IS_ERR(symlink)) {
- ret = PTR_ERR(symlink);
- goto unlock;
- }
+ const struct afs_cell *cell;
+ loff_t newpos;
+
+ _enter("%llu", ctx->pos);
+
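+	/* Bit 0 of ctx->pos selects the dotted variant; the remaining bits give
+	 * the cell's index in net->cells_dyn_ino.
+	 */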
+ for (;;) {
+ unsigned int ix = ctx->pos >> 1;
+
+ cell = idr_get_next(&net->cells_dyn_ino, &ix);
+ if (!cell)
+ return 0;
+ if (READ_ONCE(cell->state) == AFS_CELL_REMOVING ||
+ READ_ONCE(cell->state) == AFS_CELL_DEAD) {
+ ctx->pos += 2;
+ ctx->pos &= ~1;
+ continue;
+ }
- dsymlink = afs_dynroot_create_symlink(root, ".@cell");
- if (IS_ERR(dsymlink)) {
- ret = PTR_ERR(dsymlink);
- dput(symlink);
- goto unlock;
- }
+ newpos = ix << 1;
+ if (newpos > ctx->pos)
+ ctx->pos = newpos;
- /* Note that we're retaining extra refs on the dentries. */
- symlink->d_fsdata = (void *)1UL;
- dsymlink->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
+ _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino);
+
+ if ((ctx->pos & 1) == 0) {
+ if (!dir_emit(ctx, cell->name, cell->name_len,
+ cell->dynroot_ino, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ if ((ctx->pos & 1) == 1) {
+ if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1,
+ cell->dynroot_ino + 1, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ }
+ return 0;
}
/*
- * Populate a newly created dynamic root with cell names.
+ * Read the AFS dynamic root directory. This produces a list of cellnames,
+ * dotted and undotted, along with @cell and .@cell links if configured.
*/
-int afs_dynroot_populate(struct super_block *sb)
+static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
{
- struct afs_cell *cell;
- struct afs_net *net = afs_sb2net(sb);
- int ret;
-
- mutex_lock(&net->proc_cells_lock);
+ struct afs_net *net = afs_d2net(file->f_path.dentry);
+ int ret = 0;
- net->dynroot_sb = sb;
- ret = afs_dynroot_symlink(net);
- if (ret < 0)
- goto error;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
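+	/* Positions 0 and 1 are the dots; 2 and 3 are reserved for the @cell
+	 * and .@cell symlinks if a workstation cell is configured.
+	 */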
- hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
- ret = afs_dynroot_mkdir(net, cell);
- if (ret < 0)
- goto error;
+ if (ctx->pos == 2) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, "@cell", 5, 2, DT_LNK))
+ return 0;
+ ctx->pos = 3;
+ }
+ if (ctx->pos == 3) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, ".@cell", 6, 3, DT_LNK))
+ return 0;
+ ctx->pos = 4;
}
- ret = 0;
-out:
- mutex_unlock(&net->proc_cells_lock);
+ if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
+ rcu_read_lock();
+ ret = afs_dynroot_readdir_cells(net, ctx);
+ rcu_read_unlock();
+ }
return ret;
-
-error:
- net->dynroot_sb = NULL;
- goto out;
}
+static const struct file_operations afs_dynroot_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = afs_dynroot_readdir,
+ .fsync = noop_fsync,
+};
+
/*
- * When a dynamic root that's in the process of being destroyed, depopulate it
- * of pinned directories.
+ * Create an inode for a dynamic root directory.
*/
-void afs_dynroot_depopulate(struct super_block *sb)
+struct inode *afs_dynroot_iget_root(struct super_block *sb)
{
- struct afs_net *net = afs_sb2net(sb);
- struct dentry *root = sb->s_root, *subdir;
-
- /* Prevent more subdirs from being created */
- mutex_lock(&net->proc_cells_lock);
- if (net->dynroot_sb == sb)
- net->dynroot_sb = NULL;
- mutex_unlock(&net->proc_cells_lock);
-
- if (root) {
- struct hlist_node *n;
- inode_lock(root->d_inode);
-
- /* Remove all the pins for dirs created for manually added cells */
- hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
- if (subdir->d_fsdata) {
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- }
+ struct afs_super_info *as = AFS_FS_S(sb);
+ struct afs_vnode *vnode;
+ struct inode *inode;
+ struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,};
+
+ if (as->volume)
+ fid.vid = as->volume->vid;
- inode_unlock(root->d_inode);
+ inode = iget5_locked(sb, fid.vnode,
+ afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ vnode = AFS_FS_I(inode);
+
+ /* there shouldn't be an existing inode */
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_dynroot_inode_operations;
+ inode->i_fop = &afs_dynroot_file_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+ unlock_new_inode(inode);
}
+ _leave(" = %p", inode);
+ return inode;
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b516d05b0fef..07a8bfbdd9b9 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -235,20 +235,20 @@ out:
* Probe all of a fileserver's addresses to find out the best route and to
* query its capabilities.
*/
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_alist, struct key *key)
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key)
{
struct afs_endpoint_state *estate, *old;
- struct afs_addr_list *alist;
+ struct afs_addr_list *old_alist = NULL, *alist;
unsigned long unprobed;
_enter("%pU", &server->uuid);
estate = kzalloc(sizeof(*estate), GFP_KERNEL);
if (!estate)
- return;
+ return -ENOMEM;
- refcount_set(&estate->ref, 1);
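+	/* One ref is for server->endpoint_state; the other is held locally and
+	 * dropped once the probes have been dispatched.
+	 */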
+ refcount_set(&estate->ref, 2);
estate->server_id = server->debug_id;
estate->rtt = UINT_MAX;
@@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
old = rcu_dereference_protected(server->endpoint_state,
lockdep_is_held(&server->fs_lock));
- estate->responsive_set = old->responsive_set;
- estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
- afs_alist_trace_get_estate);
+ if (old) {
+ estate->responsive_set = old->responsive_set;
+ if (!new_alist)
+ new_alist = old->addresses;
+ }
+
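+	/* Rebind the rxrpc peers from the old address list to the new one so
+	 * that incoming calls can be matched back to this server.
+	 */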
+ if (old_alist != new_alist)
+ afs_set_peer_appdata(server, old_alist, new_alist);
+
+ estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate);
alist = estate->addresses;
estate->probe_seq = ++server->probe_counter;
atomic_set(&estate->nr_probing, alist->nr_addrs);
+ if (new_alist)
+ server->addr_version = new_alist->version;
rcu_assign_pointer(server->endpoint_state, estate);
- set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
write_unlock(&server->fs_lock);
+ if (old)
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
afs_estate_trace_alloc_probe);
- afs_get_address_preferences(net, alist);
+ afs_get_address_preferences(net, new_alist);
server->probed_at = jiffies;
unprobed = (1UL << alist->nr_addrs) - 1;
@@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
}
afs_put_endpoint_state(old, afs_estate_trace_put_probe);
+ afs_put_endpoint_state(estate, afs_estate_trace_put_probe);
+ return 0;
}
/*
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d9ecd5418d8..bc9556991d7c 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
- call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb);
afs_make_call(call, GFP_NOFS);
afs_wait_for_call_to_complete(call);
ret = call->error;
@@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
return false;
call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->server = afs_use_server(server, false, afs_server_trace_use_get_caps);
call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
call->probe_index = addr_index;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index df30bd62da79..440b0e731093 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -287,9 +287,8 @@ struct afs_net {
/* Cell database */
struct rb_root cells;
+ struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */
struct afs_cell __rcu *ws_cell;
- struct work_struct cells_manager;
- struct timer_list cells_timer;
atomic_t cells_outstanding;
struct rw_semaphore cells_lock;
struct mutex cells_alias_lock;
@@ -301,18 +300,11 @@ struct afs_net {
* cell, but in practice, people create aliases and subsets and there's
* no easy way to distinguish them.
*/
- seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */
- struct rb_root fs_servers; /* afs_server (by server UUID or address) */
+ seqlock_t fs_lock; /* For fs_probe_*, fs_proc */
struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
- struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
- seqlock_t fs_addr_lock; /* For fs_addresses[46] */
-
- struct work_struct fs_manager;
- struct timer_list fs_timer;
-
struct work_struct fs_prober;
struct timer_list fs_probe_timer;
atomic_t servers_outstanding;
@@ -345,13 +337,10 @@ struct afs_net {
extern const char afs_init_sysname[];
enum afs_cell_state {
- AFS_CELL_UNSET,
- AFS_CELL_ACTIVATING,
+ AFS_CELL_SETTING_UP,
AFS_CELL_ACTIVE,
- AFS_CELL_DEACTIVATING,
- AFS_CELL_INACTIVE,
- AFS_CELL_FAILED,
- AFS_CELL_REMOVED,
+ AFS_CELL_REMOVING,
+ AFS_CELL_DEAD,
};
/*
@@ -382,7 +371,9 @@ struct afs_cell {
struct afs_cell *alias_of; /* The cell this is an alias of */
struct afs_volume *root_volume; /* The root.cell volume if there is one */
struct key *anonymous_key; /* anonymous user key for this cell */
+ struct work_struct destroyer; /* Destroyer for cell */
struct work_struct manager; /* Manager for init/deinit/dns */
+ struct timer_list management_timer; /* General management timer */
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
@@ -398,6 +389,7 @@ struct afs_cell {
enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */
unsigned int dns_lookup_count; /* Counter of DNS lookups */
unsigned int debug_id;
+ unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */
/* The volumes belonging to this cell */
struct rw_semaphore vs_lock; /* Lock for server->volumes */
@@ -407,7 +399,7 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
- seqlock_t fs_lock; /* For fs_servers */
+ struct rw_semaphore fs_lock; /* For fs_servers */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -542,22 +534,22 @@ struct afs_server {
};
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
- struct rb_node uuid_rb; /* Link in net->fs_servers */
- struct afs_server __rcu *uuid_next; /* Next server with same UUID */
- struct afs_server *uuid_prev; /* Previous server with same UUID */
- struct list_head probe_link; /* Link in net->fs_probe_list */
- struct hlist_node addr_link; /* Link in net->fs_addresses6 */
+ struct rb_node uuid_rb; /* Link in cell->fs_servers */
+ struct list_head probe_link; /* Link in net->fs_probe_* */
struct hlist_node proc_link; /* Link in net->fs_proc */
struct list_head volumes; /* RCU list of afs_server_entry objects */
- struct afs_server *gc_next; /* Next server in manager's list */
+ struct work_struct destroyer; /* Work item to try and destroy a server */
+ struct timer_list timer; /* Management timer */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
#define AFS_SERVER_FL_UPDATING 1
#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */
-#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */
-#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */
-#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */
+#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */
+#define AFS_SERVER_FL_CREATING 4 /* The record is being created */
+#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */
+#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */
+#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
@@ -567,6 +559,7 @@ struct afs_server {
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
u16 service_id; /* Service ID we're using. */
+ short create_error; /* Creation error */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
@@ -621,6 +614,7 @@ struct afs_volume {
afs_volid_t vid; /* The volume ID of this volume */
afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
+ unsigned int debug_id; /* Debugging ID for traces */
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -700,7 +694,6 @@ struct afs_vnode {
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
-#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
@@ -1008,6 +1001,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
__be32 xdr, u16 port);
extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
__be32 *xdr, u16 port);
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist);
/*
* addr_prefs.c
@@ -1044,16 +1040,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
-extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
- const char *, bool);
+struct afs_cell *afs_lookup_cell(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_manage_cells(struct work_struct *);
-extern void afs_cells_timer(struct timer_list *);
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs);
extern void __net_exit afs_cell_purge(struct afs_net *);
/*
@@ -1111,11 +1108,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *);
extern const struct inode_operations afs_dynroot_inode_operations;
extern const struct dentry_operations afs_dynroot_dentry_operations;
-extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *);
-extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *);
-extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *);
-extern int afs_dynroot_populate(struct super_block *);
-extern void afs_dynroot_depopulate(struct super_block *);
+struct inode *afs_dynroot_iget_root(struct super_block *sb);
/*
* file.c
@@ -1207,8 +1200,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est
enum afs_estate_trace where);
void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_addrs, struct key *key);
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key);
int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
@@ -1228,7 +1221,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
-extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
@@ -1510,20 +1502,30 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
-extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
-extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace);
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason);
+void afs_unuse_server(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
+void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_manage_servers(struct work_struct *);
-extern void afs_servers_timer(struct timer_list *);
+void afs_purge_servers(struct afs_cell *cell);
extern void afs_fs_probe_timer(struct timer_list *);
-extern void __net_exit afs_purge_servers(struct afs_net *);
+void __net_exit afs_wait_for_servers(struct afs_net *net);
bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
+static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace)
+{
+ int r = refcount_read(&server->ref);
+ int a = atomic_read(&server->active);
+
+ trace_afs_server(server->debug_id, r, a, trace);
+}
+
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
atomic_inc(&net->servers_outstanding);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1ae0067f772d..c845c5daaeba 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -76,25 +76,17 @@ static int __net_init afs_net_init(struct net *net_ns)
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
+ idr_init(&net->cells_dyn_ino);
init_rwsem(&net->cells_lock);
- INIT_WORK(&net->cells_manager, afs_manage_cells);
- timer_setup(&net->cells_timer, afs_cells_timer, 0);
-
mutex_init(&net->cells_alias_lock);
mutex_init(&net->proc_cells_lock);
INIT_HLIST_HEAD(&net->proc_cells);
seqlock_init(&net->fs_lock);
- net->fs_servers = RB_ROOT;
INIT_LIST_HEAD(&net->fs_probe_fast);
INIT_LIST_HEAD(&net->fs_probe_slow);
INIT_HLIST_HEAD(&net->fs_proc);
- INIT_HLIST_HEAD(&net->fs_addresses);
- seqlock_init(&net->fs_addr_lock);
-
- INIT_WORK(&net->fs_manager, afs_manage_servers);
- timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
atomic_set(&net->servers_outstanding, 1);
@@ -130,13 +122,14 @@ error_open_socket:
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
error_cell_init:
net->live = false;
afs_proc_cleanup(net);
error_proc:
afs_put_sysnames(net->sysnames);
error_sysnames:
+ idr_destroy(&net->cells_dyn_ino);
net->live = false;
return ret;
}
@@ -151,10 +144,11 @@ static void __net_exit afs_net_exit(struct net *net_ns)
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ idr_destroy(&net->cells_dyn_ino);
kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 507c25a5b2cb..45cee6534122 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->force = true;
}
if (ctx->cell) {
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt);
ctx->cell = NULL;
}
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
return PTR_ERR(cell);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 12c88d8be3fe..40e879c8ca77 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true);
+ cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
goto done;
}
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin);
} else {
goto inval;
}
@@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
}
server = list_entry(v, struct afs_server, proc_link);
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
@@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
server->flags, server->rtt);
seq_printf(m, " - probe: last=%d\n",
(int)(jiffies - server->probed_at) / HZ);
+
+ estate = rcu_dereference(server->endpoint_state);
+ if (!estate)
+ goto out;
failed = estate->failed_set;
seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
estate->probe_seq, atomic_read(&estate->nr_probing),
estate->responsive_set, estate->failed_set);
+
+ alist = estate->addresses;
seq_printf(m, " - ALIST v=%u ap=%u\n",
alist->version, alist->addr_pref_version);
for (i = 0; i < alist->nr_addrs; i++) {
@@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
rxrpc_kernel_get_srtt(addr->peer),
addr->last_error, addr->prio);
}
+
+out:
return 0;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 886416ea1d96..d5e480a33859 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -179,7 +179,7 @@ static void afs_free_call(struct afs_call *call)
if (call->type->destructor)
call->type->destructor(call);
- afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
+ afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call);
kfree(call->request);
o = atomic_read(&net->nr_outstanding_calls);
@@ -766,8 +766,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long user_call_ID)
{
+ struct afs_call *call = (struct afs_call *)user_call_ID;
struct afs_net *net = afs_sock2net(sk);
+ call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall);
+ call->server = afs_find_server(call->peer);
+ if (!call->server)
+ trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer));
+
queue_work(afs_wq, &net->charge_preallocation_work);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 4504e16b458c..c530d1ca15df 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -14,190 +14,104 @@
static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */
static atomic_t afs_server_debug_id;
-static struct afs_server *afs_maybe_use_server(struct afs_server *,
- enum afs_server_trace);
static void __afs_put_server(struct afs_net *, struct afs_server *);
+static void afs_server_timer(struct timer_list *timer);
+static void afs_server_destroyer(struct work_struct *work);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server = NULL;
- unsigned int i;
- int seq = 1;
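+	/* The rxrpc peer's app data, if set, points to the afs_server that the
+	 * address was bound to (see afs_set_peer_appdata()).
+	 */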
+ struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer);
- rcu_read_lock();
-
- do {
- if (server)
- afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
-
- hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
- for (i = 0; i < alist->nr_addrs; i++)
- if (alist->addrs[i].peer == peer)
- goto found;
- }
-
- server = NULL;
- continue;
- found:
- server = afs_maybe_use_server(server, afs_server_trace_get_by_addr);
-
- } while (need_seqretry(&net->fs_addr_lock, seq));
-
- done_seqretry(&net->fs_addr_lock, seq);
-
- rcu_read_unlock();
- return server;
+ if (!server)
+ return NULL;
+ return afs_use_server(server, false, afs_server_trace_use_cm_call);
}
/*
- * Look up a server by its UUID and mark it active.
+ * Look up a server by its UUID and mark it active. The caller must hold
+ * cell->fs_lock.
*/
-struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
+static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_server *server = NULL;
+ struct afs_server *server;
struct rb_node *p;
- int diff, seq = 1;
+ int diff;
_enter("%pU", uuid);
- do {
- /* Unfortunately, rbtree walking doesn't give reliable results
- * under just the RCU read lock, so we have to check for
- * changes.
- */
- if (server)
- afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_lock, &seq);
-
- p = net->fs_servers.rb_node;
- while (p) {
- server = rb_entry(p, struct afs_server, uuid_rb);
-
- diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
- if (diff < 0) {
- p = p->rb_left;
- } else if (diff > 0) {
- p = p->rb_right;
- } else {
- afs_use_server(server, afs_server_trace_get_by_uuid);
- break;
- }
-
- server = NULL;
- }
- } while (need_seqretry(&net->fs_lock, seq));
+ p = cell->fs_servers.rb_node;
+ while (p) {
+ server = rb_entry(p, struct afs_server, uuid_rb);
- done_seqretry(&net->fs_lock, seq);
+ diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
+ if (diff < 0) {
+ p = p->rb_left;
+ } else if (diff > 0) {
+ p = p->rb_right;
+ } else {
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags))
+ return NULL; /* Need a write lock */
+ afs_use_server(server, true, afs_server_trace_use_by_uuid);
+ return server;
+ }
+ }
- _leave(" = %p", server);
- return server;
+ return NULL;
}
/*
- * Install a server record in the namespace tree. If there's a clash, we stick
- * it into a list anchored on whichever afs_server struct is actually in the
- * tree.
+ * Install a server record in the cell tree. The caller must hold an exclusive
+ * lock on cell->fs_lock.
*/
static struct afs_server *afs_install_server(struct afs_cell *cell,
- struct afs_server *candidate)
+ struct afs_server **candidate)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server, *next;
+ struct afs_server *server;
struct afs_net *net = cell->net;
struct rb_node **pp, *p;
int diff;
_enter("%p", candidate);
- write_seqlock(&net->fs_lock);
-
/* Firstly install the server in the UUID lookup tree */
- pp = &net->fs_servers.rb_node;
+ pp = &cell->fs_servers.rb_node;
p = NULL;
while (*pp) {
p = *pp;
_debug("- consider %p", p);
server = rb_entry(p, struct afs_server, uuid_rb);
- diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
- if (diff < 0) {
+ diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t));
+ if (diff < 0)
pp = &(*pp)->rb_left;
- } else if (diff > 0) {
+ else if (diff > 0)
pp = &(*pp)->rb_right;
- } else {
- if (server->cell == cell)
- goto exists;
-
- /* We have the same UUID representing servers in
- * different cells. Append the new server to the list.
- */
- for (;;) {
- next = rcu_dereference_protected(
- server->uuid_next,
- lockdep_is_held(&net->fs_lock.lock));
- if (!next)
- break;
- server = next;
- }
- rcu_assign_pointer(server->uuid_next, candidate);
- candidate->uuid_prev = server;
- server = candidate;
- goto added_dup;
- }
+ else
+ goto exists;
}
- server = candidate;
+ server = *candidate;
+ *candidate = NULL;
rb_link_node(&server->uuid_rb, p, pp);
- rb_insert_color(&server->uuid_rb, &net->fs_servers);
+ rb_insert_color(&server->uuid_rb, &cell->fs_servers);
+ write_seqlock(&net->fs_lock);
hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
+ write_sequnlock(&net->fs_lock);
afs_get_cell(cell, afs_cell_trace_get_server);
-added_dup:
- write_seqlock(&net->fs_addr_lock);
- estate = rcu_dereference_protected(server->endpoint_state,
- lockdep_is_held(&net->fs_addr_lock.lock));
- alist = estate->addresses;
-
- /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
- * it in the IPv4 and/or IPv6 reverse-map lists.
- *
- * TODO: For speed we want to use something other than a flat list
- * here; even sorting the list in terms of lowest address would help a
- * bit, but anything we might want to do gets messy and memory
- * intensive.
- */
- if (alist->nr_addrs > 0)
- hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
-
- write_sequnlock(&net->fs_addr_lock);
-
exists:
- afs_get_server(server, afs_server_trace_get_install);
- write_sequnlock(&net->fs_lock);
+ afs_use_server(server, true, afs_server_trace_use_install);
return server;
}
/*
- * Allocate a new server record and mark it active.
+ * Allocate a new server record and mark it as active but uncreated.
*/
-static struct afs_server *afs_alloc_server(struct afs_cell *cell,
- const uuid_t *uuid,
- struct afs_addr_list *alist)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -205,65 +119,49 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
if (!server)
- goto enomem;
-
- estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
- if (!estate)
- goto enomem_server;
+ return NULL;
refcount_set(&server->ref, 1);
- atomic_set(&server->active, 1);
+ atomic_set(&server->active, 0);
+ __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
+ INIT_WORK(&server->destroyer, &afs_server_destroyer);
+ timer_setup(&server->timer, afs_server_timer, 0);
INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
+ INIT_HLIST_NODE(&server->proc_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
server->service_id = FS_SERVICE;
-
server->probe_counter = 1;
server->probed_at = jiffies - LONG_MAX / 2;
- refcount_set(&estate->ref, 1);
- estate->addresses = alist;
- estate->server_id = server->debug_id;
- estate->probe_seq = 1;
- rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
- trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
- trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
- afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
-
-enomem_server:
- kfree(server);
-enomem:
- _leave(" = NULL [nomem]");
- return NULL;
}
/*
* Look up an address record for a server
*/
-static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
- struct key *key, const uuid_t *uuid)
+static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server,
+ struct key *key)
{
struct afs_vl_cursor vc;
struct afs_addr_list *alist = NULL;
int ret;
ret = -ERESTARTSYS;
- if (afs_begin_vlserver_operation(&vc, cell, key)) {
+ if (afs_begin_vlserver_operation(&vc, server->cell, key)) {
while (afs_select_vlserver(&vc)) {
if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
- alist = afs_yfsvl_get_endpoints(&vc, uuid);
+ alist = afs_yfsvl_get_endpoints(&vc, &server->uuid);
else
- alist = afs_vl_get_addrs_u(&vc, uuid);
+ alist = afs_vl_get_addrs_u(&vc, &server->uuid);
}
ret = afs_end_vlserver_operation(&vc);
@@ -273,72 +171,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
}
/*
- * Get or create a fileserver record.
+ * Get or create a fileserver record and return it with an active-use count on
+ * it.
*/
struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
const uuid_t *uuid, u32 addr_version)
{
- struct afs_addr_list *alist;
- struct afs_server *server, *candidate;
+ struct afs_addr_list *alist = NULL;
+ struct afs_server *server, *candidate = NULL;
+ bool creating = false;
+ int ret;
_enter("%p,%pU", cell->net, uuid);
- server = afs_find_server_by_uuid(cell->net, uuid);
+ down_read(&cell->fs_lock);
+ server = afs_find_server_by_uuid(cell, uuid);
+ /* Won't see servers marked uncreated. */
+ up_read(&cell->fs_lock);
+
if (server) {
+ timer_delete_sync(&server->timer);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags))
+ goto wait_for_creation;
if (server->addr_version != addr_version)
set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
return server;
}
- alist = afs_vl_lookup_addrs(cell, key, uuid);
- if (IS_ERR(alist))
- return ERR_CAST(alist);
-
- candidate = afs_alloc_server(cell, uuid, alist);
+ candidate = afs_alloc_server(cell, uuid);
if (!candidate) {
afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
- server = afs_install_server(cell, candidate);
- if (server != candidate) {
- afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
+ down_write(&cell->fs_lock);
+ server = afs_install_server(cell, &candidate);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) {
+ /* We need to wait for creation to complete. */
+ up_write(&cell->fs_lock);
+ goto wait_for_creation;
+ }
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ set_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
+ }
+ up_write(&cell->fs_lock);
+ timer_delete_sync(&server->timer);
+
+ /* If we get to create the server, we look up the addresses and then
+ * immediately dispatch an asynchronous probe to each interface on the
+ * fileserver. This will make sure the repeat-probing service is
+ * started.
+ */
+ if (creating) {
+ alist = afs_vl_lookup_addrs(server, key);
+ if (IS_ERR(alist)) {
+ ret = PTR_ERR(alist);
+ goto create_failed;
+ }
+
+ ret = afs_fs_probe_fileserver(cell->net, server, alist, key);
+ if (ret)
+ goto create_failed;
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ }
+
+out:
+ afs_put_addrlist(alist, afs_alist_trace_put_server_create);
+ if (candidate) {
+		kfree(rcu_access_pointer(candidate->endpoint_state));
kfree(candidate);
- } else {
- /* Immediately dispatch an asynchronous probe to each interface
- * on the fileserver. This will make sure the repeat-probing
- * service is started.
- */
- afs_fs_probe_fileserver(cell->net, server, alist, key);
+ afs_dec_servers_outstanding(cell->net);
+ }
+ return server ?: ERR_PTR(ret);
+
+wait_for_creation:
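+	/* Another thread is creating this record; wait for it to finish and
+	 * then pick up any error it recorded.
+	 */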
+ afs_see_server(server, afs_server_trace_wait_create);
+ wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE);
+ if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ /* Barrier: read flag before error */
+ ret = READ_ONCE(server->create_error);
+ afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+ goto out;
}
- return server;
-}
+ ret = 0;
+ goto out;
-/*
- * Set the server timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_server_timer(struct afs_net *net, time64_t delay)
-{
- if (net->live) {
- afs_inc_servers_outstanding(net);
- if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
- afs_dec_servers_outstanding(net);
+create_failed:
+ down_write(&cell->fs_lock);
+
+ WRITE_ONCE(server->create_error, ret);
+ smp_wmb(); /* Barrier: set error before flag. */
+ set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
}
+ afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+
+ up_write(&cell->fs_lock);
+ goto out;
}
/*
- * Server management timer. We have an increment on fs_outstanding that we
- * need to pass along to the work item.
+ * Set/reduce a server's timer.
*/
-void afs_servers_timer(struct timer_list *timer)
+static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs)
{
- struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
-
- _enter("");
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ mod_timer(&server->timer, jiffies + delay_secs * HZ);
}
/*
@@ -357,32 +305,20 @@ struct afs_server *afs_get_server(struct afs_server *server,
}
/*
- * Try to get a reference on a server object.
+ * Get an active count on a server object and maybe cancel its expiry timer.
*/
-static struct afs_server *afs_maybe_use_server(struct afs_server *server,
- enum afs_server_trace reason)
-{
- unsigned int a;
- int r;
-
- if (!__refcount_inc_not_zero(&server->ref, &r))
- return NULL;
-
- a = atomic_inc_return(&server->active);
- trace_afs_server(server->debug_id, r + 1, a, reason);
- return server;
-}
-
-/*
- * Get an active count on a server object.
- */
-struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason)
{
unsigned int a;
int r;
__refcount_inc(&server->ref, &r);
a = atomic_inc_return(&server->active);
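+	/* The first active user cancels any pending expiry timer, provided
+	 * activation was requested and the record hasn't already expired.
+	 */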
+ if (a == 1 && activate &&
+ !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ del_timer(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
@@ -415,13 +351,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- unsigned int active = atomic_dec_return(&server->active);
+ if (!server)
+ return;
- if (active == 0)
- afs_set_server_timer(net, afs_server_gc_delay);
- afs_put_server(net, server, reason);
+ if (atomic_dec_and_test(&server->active)) {
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) ||
+ READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING)
+ schedule_work(&server->destroyer);
}
+
+ afs_put_server(net, server, reason);
}
/*
@@ -430,10 +369,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
void afs_unuse_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- server->unuse_time = ktime_get_real_seconds();
- afs_unuse_server_notime(net, server, reason);
+ if (!server)
+ return;
+
+ if (atomic_dec_and_test(&server->active)) {
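+		/* On the last active user going away, either start the GC timer
+		 * or, if the cell or namespace is being torn down, destroy the
+		 * record immediately.
+		 */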
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) &&
+ READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) {
+ time64_t unuse_time = ktime_get_real_seconds();
+
+ server->unuse_time = unuse_time;
+ afs_set_server_timer(server, afs_server_gc_delay);
+ } else {
+ schedule_work(&server->destroyer);
+ }
}
+
+ afs_put_server(net, server, reason);
}
static void afs_server_rcu(struct rcu_head *rcu)
@@ -463,159 +414,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
}
/*
- * destroy a dead server
+ * Check to see if the server record has expired.
*/
-static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
+static bool afs_has_server_expired(const struct afs_server *server)
{
- if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
- afs_give_up_callbacks(net, server);
+ time64_t expires_at;
- afs_put_server(net, server, afs_server_trace_destroy);
+ if (atomic_read(&server->active))
+ return false;
+
+	if (!server->cell->net->live ||
+	    server->cell->state >= AFS_CELL_REMOVING) {
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ 0, afs_server_trace_purging);
+ return true;
+ }
+
+ expires_at = server->unuse_time;
+ if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
+ !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
+ expires_at += afs_server_gc_delay;
+
+ return ktime_get_real_seconds() > expires_at;
}
/*
- * Garbage collect any expired servers.
+ * Remove a server record from its parent cell's database.
*/
-static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
+static bool afs_remove_server_from_cell(struct afs_server *server)
{
- struct afs_server *server, *next, *prev;
- int active;
-
- while ((server = gc_list)) {
- gc_list = server->gc_next;
-
- write_seqlock(&net->fs_lock);
-
- active = atomic_read(&server->active);
- if (active == 0) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_gc);
- next = rcu_dereference_protected(
- server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
- prev = server->uuid_prev;
- if (!prev) {
- /* The one at the front is in the tree */
- if (!next) {
- rb_erase(&server->uuid_rb, &net->fs_servers);
- } else {
- rb_replace_node_rcu(&server->uuid_rb,
- &next->uuid_rb,
- &net->fs_servers);
- next->uuid_prev = NULL;
- }
- } else {
- /* This server is not at the front */
- rcu_assign_pointer(prev->uuid_next, next);
- if (next)
- next->uuid_prev = prev;
- }
-
- list_del(&server->probe_link);
- hlist_del_rcu(&server->proc_link);
- if (!hlist_unhashed(&server->addr_link))
- hlist_del_rcu(&server->addr_link);
- }
- write_sequnlock(&net->fs_lock);
+ struct afs_cell *cell = server->cell;
+
+ down_write(&cell->fs_lock);
- if (active == 0)
- afs_destroy_server(net, server);
+ if (!afs_has_server_expired(server)) {
+ up_write(&cell->fs_lock);
+ return false;
}
+
+ set_bit(AFS_SERVER_FL_EXPIRED, &server->flags);
+ _debug("expire %pU %u", &server->uuid, atomic_read(&server->active));
+ afs_see_server(server, afs_server_trace_see_expired);
+ rb_erase(&server->uuid_rb, &cell->fs_servers);
+ up_write(&cell->fs_lock);
+ return true;
}
-/*
- * Manage the records of servers known to be within a network namespace. This
- * includes garbage collecting unused servers.
- *
- * Note also that we were given an increment on net->servers_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_servers(struct work_struct *work)
+static void afs_server_destroyer(struct work_struct *work)
{
- struct afs_net *net = container_of(work, struct afs_net, fs_manager);
- struct afs_server *gc_list = NULL;
- struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
-
- _enter("");
+ struct afs_endpoint_state *estate;
+ struct afs_server *server = container_of(work, struct afs_server, destroyer);
+ struct afs_net *net = server->cell->net;
- /* Trawl the server list looking for servers that have expired from
- * lack of use.
- */
- read_seqlock_excl(&net->fs_lock);
+ afs_see_server(server, afs_server_trace_see_destroyer);
- for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
- struct afs_server *server =
- rb_entry(cursor, struct afs_server, uuid_rb);
- int active = atomic_read(&server->active);
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ return;
- _debug("manage %pU %u", &server->uuid, active);
+ if (!afs_remove_server_from_cell(server))
+ return;
- if (purging) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_purging);
- if (active != 0)
- pr_notice("Can't purge s=%08x\n", server->debug_id);
- }
+ timer_shutdown_sync(&server->timer);
+ cancel_work(&server->destroyer);
- if (active == 0) {
- time64_t expire_at = server->unuse_time;
-
- if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
- !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
- expire_at += afs_server_gc_delay;
- if (purging || expire_at <= now) {
- server->gc_next = gc_list;
- gc_list = server;
- } else if (expire_at < next_manage) {
- next_manage = expire_at;
- }
- }
- }
+ if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+ afs_give_up_callbacks(net, server);
- read_sequnlock_excl(&net->fs_lock);
+ /* Unbind the rxrpc_peer records from the server. */
+ estate = rcu_access_pointer(server->endpoint_state);
+ if (estate)
+ afs_set_peer_appdata(server, estate->addresses, NULL);
- /* Update the timer on the way out. We have to pass an increment on
- * servers_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
+ write_seqlock(&net->fs_lock);
+ list_del_init(&server->probe_link);
+ if (!hlist_unhashed(&server->proc_link))
+ hlist_del_rcu(&server->proc_link);
+ write_sequnlock(&net->fs_lock);
- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->fs_manager))
- afs_inc_servers_outstanding(net);
- } else {
- afs_set_server_timer(net, next_manage - now);
- }
- }
+ afs_put_server(net, server, afs_server_trace_destroy);
+}
- afs_gc_servers(net, gc_list);
+static void afs_server_timer(struct timer_list *timer)
+{
+ struct afs_server *server = container_of(timer, struct afs_server, timer);
- afs_dec_servers_outstanding(net);
- _leave(" [%d]", atomic_read(&net->servers_outstanding));
+ afs_see_server(server, afs_server_trace_see_timer);
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ schedule_work(&server->destroyer);
}
-static void afs_queue_server_manager(struct afs_net *net)
+/*
+ * Wake up all the servers in a cell so that they can purge themselves.
+ */
+void afs_purge_servers(struct afs_cell *cell)
{
- afs_inc_servers_outstanding(net);
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ struct afs_server *server;
+ struct rb_node *rb;
+
+ down_read(&cell->fs_lock);
+ for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) {
+ server = rb_entry(rb, struct afs_server, uuid_rb);
+ afs_see_server(server, afs_server_trace_see_purge);
+ schedule_work(&server->destroyer);
+ }
+ up_read(&cell->fs_lock);
}
/*
- * Purge list of servers.
+ * Wait for outstanding servers.
*/
-void afs_purge_servers(struct afs_net *net)
+void afs_wait_for_servers(struct afs_net *net)
{
_enter("");
- if (del_timer_sync(&net->fs_timer))
- afs_dec_servers_outstanding(net);
-
- afs_queue_server_manager(net);
-
- _debug("wait");
atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
@@ -639,7 +550,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
atomic_read(&server->active),
afs_server_trace_update);
- alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
+ alist = afs_vl_lookup_addrs(server, op->key);
if (IS_ERR(alist)) {
rcu_read_lock();
estate = rcu_dereference(server->endpoint_state);
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index d20cd902ef94..20d5474837df 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
if (slist && refcount_dec_and_test(&slist->usage)) {
for (i = 0; i < slist->nr_servers; i++)
afs_unuse_server(net, slist->servers[i].server,
- afs_server_trace_put_slist);
+ afs_server_trace_unuse_slist);
kfree_rcu(slist, rcu);
}
}
@@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_unuse_server(volume->cell->net, server,
- afs_server_trace_put_slist_isort);
+ afs_unuse_server_notime(volume->cell->net, server,
+ afs_server_trace_unuse_slist_isort);
continue;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a9bee610674e..25b306db6992 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
if (as->dyn_root)
seq_puts(m, ",dyn");
- if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
- seq_puts(m, ",autocell");
switch (as->flock_mode) {
case afs_flock_mode_unset: break;
case afs_flock_mode_local: p = "local"; break;
@@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false);
+ NULL, false,
+ afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse);
afs_see_cell(cell, afs_cell_trace_see_source);
ctx->cell = cell;
}
@@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = NULL;
cell = afs_use_cell(ctx->cell->alias_of,
afs_cell_trace_use_fc_alias);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
ctx->cell = cell;
goto reget_key;
}
@@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* allocate the root inode and dentry */
if (as->dyn_root) {
- inode = afs_iget_pseudo_dir(sb, true);
+ inode = afs_dynroot_iget_root(sb);
} else {
sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
@@ -478,9 +477,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (ctx->autocell || as->dyn_root)
- set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
-
ret = -ENOMEM;
sb->s_root = d_make_root(inode);
if (!sb->s_root)
@@ -488,9 +484,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
if (as->dyn_root) {
sb->s_d_op = &afs_dynroot_dentry_operations;
- ret = afs_dynroot_populate(sb);
- if (ret < 0)
- goto error;
} else {
sb->s_d_op = &afs_fs_dentry_operations;
rcu_assign_pointer(as->volume->sb, sb);
@@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
- struct afs_net *net = afs_net(as->net_ns);
afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
- afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
+ afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
}
@@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb)
{
struct afs_super_info *as = AFS_FS_S(sb);
- if (as->dyn_root)
- afs_dynroot_depopulate(sb);
-
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc)
afs_destroy_sbi(fc->s_fs_info);
afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f9e76b604f31..709b4cdb723e 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
goto is_alias;
if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
return -ERESTARTSYS;
}
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
}
mutex_unlock(&cell->net->proc_cells_lock);
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false);
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
return PTR_ERR(master);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index d8f79f6ada3d..6ad9688d8f4b 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
cell->dns_expiry <= ktime_get_real_seconds()) {
dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
- afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
+ afs_queue_cell(cell, afs_cell_trace_queue_dns);
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
if (wait_var_event_interruptible(
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index af3a3f57c1b3..0efff3d25133 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -10,6 +10,7 @@
#include "internal.h"
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static atomic_t afs_volume_debug_id;
static void afs_destroy_volume(struct work_struct *work);
@@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, refcount_read(&cell->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
if (!volume)
goto error_0;
+ volume->debug_id = atomic_inc_return(&afs_volume_debug_id);
volume->vid = vldb->vid[params->type];
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol);
@@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
*_slist = slist;
rcu_assign_pointer(volume->servers, slist);
- trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
+ trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc);
return volume;
error_1:
@@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, refcount_read(&volume->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
int r;
if (__refcount_inc_not_zero(&volume->ref, &r)) {
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
return true;
}
return false;
@@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
int r;
__refcount_inc(&volume->ref, &r);
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
}
return volume;
}
@@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
+ unsigned int debug_id = volume->debug_id;
afs_volid_t vid = volume->vid;
bool zero;
int r;
zero = __refcount_dec_and_test(&volume->ref, &r);
- trace_afs_volume(vid, r - 1, reason);
+ trace_afs_volume(debug_id, vid, r - 1, reason);
if (zero)
schedule_work(&volume->destructor);
}
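
The hunks above give each afs_volume a debug_id taken from a global atomic counter so the afs_volume tracepoints can tell volume instances apart even across refcount death and reallocation. A rough standalone sketch of that pattern follows (plain C11 atomics and an invented struct volume, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint debug_id_counter;

struct volume {
	unsigned int debug_id;
	unsigned long long vid;
};

static struct volume *volume_alloc(unsigned long long vid)
{
	struct volume *v = calloc(1, sizeof(*v));

	if (!v)
		return NULL;
	/* one id per allocation, never reused */
	v->debug_id = atomic_fetch_add(&debug_id_counter, 1) + 1;
	v->vid = vid;
	return v;
}

int main(void)
{
	struct volume *a = volume_alloc(100), *b = volume_alloc(100);

	if (!a || !b)
		return 1;
	/* same vid, distinct debug ids in the trace output */
	printf("a: id=%u vid=%llu\n", a->debug_id, a->vid);
	printf("b: id=%u vid=%llu\n", b->debug_id, b->vid);
	free(a);
	free(b);
	return 0;
}
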
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 77c7991d89aa..23cea74f9933 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *);
static inline int autofs_check_pipe(struct file *pipe)
{
+ if (pipe->f_mode & FMODE_PATH)
+ return -EINVAL;
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
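
autofs_check_pipe() now rejects FMODE_PATH files before the write-capability and FIFO checks. A small userspace illustration of why an O_PATH descriptor is unusable as a notification pipe: it can name a FIFO, yet any write() on it fails with EBADF. The scratch FIFO path and error handling are only for the demo:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/autofs-demo-fifo";
	struct stat st;

	unlink(path);
	if (mkfifo(path, 0600)) {
		perror("mkfifo");
		return 1;
	}

	/* O_PATH does not block on a FIFO because nothing is really opened */
	int fd = open(path, O_PATH);
	if (fd < 0) {
		perror("open(O_PATH)");
		return 1;
	}

	if (!fstat(fd, &st))
		printf("S_ISFIFO: %d\n", S_ISFIFO(st.st_mode));	/* 1 */

	if (write(fd, "x", 1) < 0)				/* EBADF */
		printf("write on O_PATH fd: %s\n", strerror(errno));

	close(fd);
	unlink(path);
	return 0;
}
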
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 6d57efbb8110..c5a6aae12d2c 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
sbi->exp_timeout = timeout * HZ;
} else {
struct dentry *base = fp->f_path.dentry;
- struct inode *inode = base->d_inode;
int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
struct dentry *dentry;
struct autofs_info *ino;
@@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
"the parent autofs mount timeout which could "
"prevent shutdown\n");
- inode_lock_shared(inode);
dentry = try_lookup_one_len(param->path, base, path_len);
- inode_unlock_shared(inode);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 530d18827e35..174c7205fee4 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t);
static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static long autofs_root_compat_ioctl(struct file *,
@@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
d_add(dentry, inode);
if (sbi->version < 5)
@@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inc_nlink(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- return 0;
+ return NULL;
}
/* Get/set timeout ioctl() operation */
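
The ->mkdir conversion above follows the VFS move from returning an int to returning a struct dentry * that is either NULL (or a new dentry) on success or an ERR_PTR-encoded errno. A minimal userspace sketch of that encoding convention, using simplified re-implementations of ERR_PTR/PTR_ERR/IS_ERR for illustration only:

#include <errno.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)     { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-4095;
}

static void *do_mkdir(int out_of_memory)
{
	if (out_of_memory)
		return ERR_PTR(-ENOMEM);	/* was: return -ENOMEM; */
	return NULL;				/* was: return 0;       */
}

int main(void)
{
	void *d = do_mkdir(1);

	if (IS_ERR(d))
		printf("mkdir failed: %ld\n", PTR_ERR(d));
	else
		printf("mkdir succeeded\n");
	return 0;
}
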
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 316d88da2ce1..0ef9bcb744dd 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap,
return -EIO;
}
-static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return -EIO;
+ return ERR_PTR(-EIO);
}
static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fc7efd0a7525..c9798750202d 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -16,7 +16,7 @@ config BCACHEFS_FS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
select CRYPTO
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA256
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
select KEYS
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index d2689388d5e8..9af65079374f 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -41,7 +41,6 @@ bcachefs-y := \
extent_update.o \
eytzinger.o \
fs.o \
- fs-common.o \
fs-ioctl.o \
fs-io.o \
fs-io-buffered.o \
@@ -64,9 +63,11 @@ bcachefs-y := \
migrate.o \
move.o \
movinggc.o \
+ namei.o \
nocow_locking.o \
opts.o \
printbuf.o \
+ progress.o \
quota.o \
rebalance.o \
rcu_pending.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 3ea809990ef1..5fb396be9127 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v2_unpack_error,
+ c, alloc_v3_unpack_error,
"unpack error");
fsck_err:
return ret;
@@ -777,14 +777,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
s64 delta_sectors,
s64 delta_fragmented, unsigned flags)
{
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = data_type,
- };
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
- return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ d, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = data_type);
}
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
@@ -837,7 +835,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
if (!ca)
- return -EIO;
+ return -BCH_ERR_trigger_alloc;
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -871,6 +869,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (data_type_is_empty(new_a->data_type) &&
BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ if (new_a->oldest_gen == new_a->gen &&
+ !bch2_bucket_sectors_total(*new_a))
+ new_a->oldest_gen++;
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
alloc_data_type_set(new_a, new_a->data_type);
@@ -889,26 +890,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!new_a->io_time[READ])
new_a->io_time[READ] = bch2_current_io_time(c, READ);
- u64 old_lru = alloc_lru_idx_read(*old_a);
- u64 new_lru = alloc_lru_idx_read(*new_a);
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans, new.k->p.inode,
- bucket_to_u64(new.k->p),
- old_lru, new_lru);
- if (ret)
- goto err;
- }
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ alloc_lru_idx_read(*old_a),
+ alloc_lru_idx_read(*new_a));
+ if (ret)
+ goto err;
- old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
- new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans,
- BCH_LRU_FRAGMENTATION_START,
- bucket_to_u64(new.k->p),
- old_lru, new_lru);
- if (ret)
- goto err;
- }
+ ret = bch2_lru_change(trans,
+ BCH_LRU_BUCKET_FRAGMENTATION,
+ bucket_to_u64(new.k->p),
+ alloc_lru_idx_fragmentation(*old_a, ca),
+ alloc_lru_idx_fragmentation(*new_a, ca));
+ if (ret)
+ goto err;
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
@@ -1034,7 +1029,7 @@ fsck_err:
invalid_bucket:
bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -EIO;
+ ret = -BCH_ERR_trigger_alloc;
goto err;
}
@@ -1705,7 +1700,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
if (lru_idx) {
- ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
+ bucket_to_u64(alloc_k.k->p),
lru_idx, alloc_k, last_flushed);
if (ret)
goto err;
@@ -1735,7 +1731,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = &a_mut->v;
}
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ a->io_time[READ],
alloc_k, last_flushed);
if (ret)
goto err;
@@ -1757,7 +1755,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
+ bch2_check_stripe_to_lru_refs(c);
bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
@@ -1805,6 +1804,19 @@ struct discard_buckets_state {
u64 discarded;
};
+/*
+ * This is needed because discard is both a filesystem option and a device
+ * option, and mount options are supposed to apply to that mount and not be
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
+ * device.
+ */
+static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
+{
+ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
+ ? c->opts.discard
+ : ca->mi.discard;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca,
struct btree_iter *need_discard_iter,
@@ -1868,7 +1880,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
s->discarded++;
*discard_pos_done = iter.pos;
- if (ca->mi.discard && !c->opts.nochanges) {
+ if (discard_opt_enabled(c, ca) && !c->opts.nochanges) {
/*
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
@@ -1897,7 +1909,10 @@ commit:
if (ret)
goto out;
- count_event(c, bucket_discard);
+ if (!fastpath)
+ count_event(c, bucket_discard);
+ else
+ count_event(c, bucket_discard_fast);
out:
fsck_err:
if (discard_locked)
@@ -2055,16 +2070,71 @@ put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
+static int invalidate_one_bp(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
+{
+ struct btree_iter extent_iter;
+ struct bkey_s_c extent_k =
+ bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
+ int ret = bkey_err(extent_k);
+ if (ret)
+ return ret;
+
+ struct bkey_i *n =
+ bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
+err:
+ bch2_trans_iter_exit(trans, &extent_iter);
+ return ret;
+}
+
+static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bpos bucket,
+ u8 gen,
+ struct bkey_buf *last_flushed)
+{
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
+
+ return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+ bp_start, bp_end, 0, k,
+ NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc, ({
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ /* filter out bps with gens that don't match */
+ if (bp.v->bucket_gen != gen)
+ continue;
+
+ invalidate_one_bp(trans, ca, bp, last_flushed);
+ }));
+}
+
+noinline_for_stack
static int invalidate_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
+ struct bkey_buf *last_flushed,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
- struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
- unsigned cached_sectors;
+ struct btree_iter alloc_iter = {};
int ret = 0;
if (*nr_to_invalidate <= 0)
@@ -2081,35 +2151,37 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
- ret = PTR_ERR_OR_ZERO(a);
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ BTREE_ITER_cached);
+ ret = bkey_err(alloc_k);
if (ret)
- goto out;
+ return ret;
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
/* We expect harmless races here due to the btree write buffer: */
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
goto out;
- BUG_ON(a->v.data_type != BCH_DATA_cached);
- BUG_ON(a->v.dirty_sectors);
+ /*
+ * Impossible since alloc_lru_idx_read() only returns nonzero if the
+ * bucket is supposed to be on the cached bucket LRU (i.e.
+ * BCH_DATA_cached)
+ *
+ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
+ */
+ BUG_ON(a->data_type != BCH_DATA_cached);
+ BUG_ON(a->dirty_sectors);
- if (!a->v.cached_sectors)
+ if (!a->cached_sectors)
bch_err(c, "invalidating empty bucket, confused");
- cached_sectors = a->v.cached_sectors;
+ unsigned cached_sectors = a->cached_sectors;
+ u8 gen = a->gen;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- a->v.gen++;
- a->v.data_type = 0;
- a->v.dirty_sectors = 0;
- a->v.stripe_sectors = 0;
- a->v.cached_sectors = 0;
- a->v.io_time[READ] = bch2_current_io_time(c, READ);
- a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
-
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
if (ret)
goto out;
@@ -2117,6 +2189,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
--*nr_to_invalidate;
out:
fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
}
@@ -2143,6 +2216,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
@@ -2167,7 +2244,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (!k.k)
break;
- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
restart_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -2180,6 +2257,7 @@ restart_err:
err:
bch2_trans_put(trans);
percpu_ref_put(&ca->io_ref);
+ bch2_bkey_buf_exit(&last_flushed, c);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
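
discard_opt_enabled() above makes an explicitly-set discard mount option override the per-device setting for that mount, while leaving the device option authoritative otherwise. A standalone sketch of that precedence rule; the field and type names here are invented for the example:

#include <stdbool.h>
#include <stdio.h>

struct fs_opts  { bool discard_set_at_mount; bool discard; };
struct dev_opts { bool discard; };

static bool discard_enabled(const struct fs_opts *fs, const struct dev_opts *dev)
{
	return fs->discard_set_at_mount ? fs->discard : dev->discard;
}

int main(void)
{
	struct fs_opts  mount_default   = { .discard_set_at_mount = false };
	struct fs_opts  mount_nodiscard = { .discard_set_at_mount = true, .discard = false };
	struct dev_opts dev             = { .discard = true };

	printf("default mount, device discard=1 -> %d\n",
	       discard_enabled(&mount_default, &dev));		/* 1 */
	printf("mounted with nodiscard         -> %d\n",
	       discard_enabled(&mount_nodiscard, &dev));	/* 0 */
	return 0;
}
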
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index de25ba4ee94b..c556ccaffe89 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
if (a.stripe)
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
if (bch2_bucket_sectors_dirty(a))
- return data_type;
+ return bucket_data_type(data_type);
if (a.cached_sectors)
return BCH_DATA_cached;
if (BCH_ALLOC_V4_NEED_DISCARD(&a))
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 5a781fb4c794..0cac65347a5d 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
void bch2_open_bucket_write_error(struct bch_fs *c,
struct open_buckets *obs,
- unsigned dev)
+ unsigned dev, int err)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob);
+ bch2_ec_bucket_cancel(c, ob, err);
}
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
@@ -179,23 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->freelist_wait);
}
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
- switch (watermark) {
- case BCH_WATERMARK_interior_updates:
- return 0;
- case BCH_WATERMARK_reclaim:
- return OPEN_BUCKETS_COUNT / 6;
- case BCH_WATERMARK_btree:
- case BCH_WATERMARK_btree_copygc:
- return OPEN_BUCKETS_COUNT / 4;
- case BCH_WATERMARK_copygc:
- return OPEN_BUCKETS_COUNT / 3;
- default:
- return OPEN_BUCKETS_COUNT / 2;
- }
-}
-
static inline bool may_alloc_bucket(struct bch_fs *c,
struct bpos bucket,
struct bucket_alloc_state *s)
@@ -239,7 +222,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
spin_lock(&c->freelist_lock);
- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -648,7 +631,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
struct bch_dev_usage *usage)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -728,7 +711,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct bch_dev_usage usage;
struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+ cl, flags & BCH_WRITE_alloc_nowait, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@@ -1336,7 +1319,7 @@ retry:
if (wp->data_type != BCH_DATA_user)
have_cache = true;
- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+ if (target && !(flags & BCH_WRITE_only_specified_devs)) {
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
@@ -1426,7 +1409,7 @@ err:
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
ret = -BCH_ERR_bucket_alloc_blocked;
- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+ if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
ret = -BCH_ERR_bucket_alloc_blocked;
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index f25481a0d1a0..69ec6a012898 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -33,6 +33,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
return bch2_dev_have_ref(c, ob->dev);
}
+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
+{
+ switch (watermark) {
+ case BCH_WATERMARK_interior_updates:
+ return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
+ case BCH_WATERMARK_btree:
+ case BCH_WATERMARK_btree_copygc:
+ return OPEN_BUCKETS_COUNT / 4;
+ case BCH_WATERMARK_copygc:
+ return OPEN_BUCKETS_COUNT / 3;
+ default:
+ return OPEN_BUCKETS_COUNT / 2;
+ }
+}
+
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
enum bch_watermark, enum bch_data_type,
struct closure *);
@@ -65,7 +82,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
}
void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned);
+ struct open_buckets *, unsigned, int);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
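
bch2_open_buckets_reserved(), now exposed from the header, keeps a fraction of the open-bucket table in reserve per watermark, and an allocation blocks once the free count drops to that reserve. The sketch below mirrors the mapping; the OPEN_BUCKETS_COUNT value of 1024 and the admission check in main() are assumptions for illustration:

#include <stdio.h>

#define OPEN_BUCKETS_COUNT 1024

enum watermark {
	WM_interior_updates, WM_reclaim, WM_btree,
	WM_btree_copygc, WM_copygc, WM_normal,
};

static unsigned open_buckets_reserved(enum watermark w)
{
	switch (w) {
	case WM_interior_updates:	return 0;
	case WM_reclaim:		return OPEN_BUCKETS_COUNT / 6;
	case WM_btree:
	case WM_btree_copygc:		return OPEN_BUCKETS_COUNT / 4;
	case WM_copygc:			return OPEN_BUCKETS_COUNT / 3;
	default:			return OPEN_BUCKETS_COUNT / 2;
	}
}

int main(void)
{
	unsigned nr_free = 300;

	for (enum watermark w = WM_interior_updates; w <= WM_normal; w++)
		printf("watermark %d: reserved=%u, alloc %s\n", (int)w,
		       open_buckets_reserved(w),
		       nr_free <= open_buckets_reserved(w) ? "blocks" : "ok");
	return 0;
}
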
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 4aa8ee026cb8..8f79f46c2a78 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -90,6 +90,7 @@ struct dev_stripe_state {
x(stopped) \
x(waiting_io) \
x(waiting_work) \
+ x(runnable) \
x(running)
enum write_point_state {
@@ -125,6 +126,7 @@ struct write_point {
enum write_point_state state;
u64 last_state_change;
u64 time[WRITE_POINT_STATE_NR];
+ u64 last_runtime;
} __aligned(SMP_CACHE_BYTES);
};
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ebeb6a5ff9d2..20c497f0c2cb 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -11,6 +11,7 @@
#include "checksum.h"
#include "disk_accounting.h"
#include "error.h"
+#include "progress.h"
#include <linux/mm.h>
@@ -49,6 +50,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
}
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_str(out, " data_type=");
+ bch2_prt_data_type(out, bp.v->data_type);
prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
(u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
bp.v->bucket_len,
@@ -244,27 +247,31 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
return bkey_s_c_null;
- if (likely(!bp.v->level)) {
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0, 0,
- iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
+ bch2_trans_node_iter_init(trans, iter,
+ bp.v->btree_id,
+ bp.v->pos,
+ 0,
+ bp.v->level,
+ iter_flags);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
- if (k.k &&
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
- return k;
+ if (k.k &&
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+ return k;
- bch2_trans_iter_exit(trans, iter);
+ bch2_trans_iter_exit(trans, iter);
+
+ if (!bp.v->level) {
int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+ return bkey_s_c_null;
if (IS_ERR_OR_NULL(b))
return ((struct bkey_s_c) { .k = ERR_CAST(b) });
@@ -514,6 +521,22 @@ check_existing_bp:
if (!other_extent.k)
goto missing;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
+ if (ca) {
+ struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
+ bkey_for_each_ptr(other_extent_ptrs, ptr)
+ if (ptr->dev == bp->k.p.inode &&
+ dev_ptr_stale_rcu(ca, ptr)) {
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+ other_extent, bp->k.p.inode);
+ if (ret)
+ goto err;
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
if (bch2_extents_match(orig_k, other_extent)) {
printbuf_reset(&buf);
prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
@@ -590,9 +613,6 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
continue;
@@ -600,9 +620,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
+
+ bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
rcu_read_unlock();
- if (check || empty) {
+ if ((check || empty) && !stale) {
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
@@ -715,71 +737,6 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
-struct progress_indicator_state {
- unsigned long next_print;
- u64 nodes_seen;
- u64 nodes_total;
- struct btree *last_node;
-};
-
-static inline void progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
-{
- memset(s, 0, sizeof(*s));
-
- s->next_print = jiffies + HZ * 10;
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- if (!(btree_id_mask & BIT_ULL(i)))
- continue;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = i,
- };
-
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
- }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
- bool ret = time_after_eq(jiffies, s->next_print);
-
- if (ret)
- s->next_print = jiffies + HZ * 10;
- return ret;
-}
-
-static void progress_update_iter(struct btree_trans *trans,
- struct progress_indicator_state *s,
- struct btree_iter *iter,
- const char *msg)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
- s->nodes_seen += b != s->last_node;
- s->last_node = b;
-
- if (progress_update_p(s)) {
- struct printbuf buf = PRINTBUF;
- unsigned percent = s->nodes_total
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
- : 0;
-
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-}
-
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
@@ -787,7 +744,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct progress_indicator_state progress;
int ret = 0;
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
@@ -806,7 +763,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -827,7 +784,7 @@ enum alloc_sector_counter {
ALLOC_SECTORS_NR
};
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+static int data_type_to_alloc_counter(enum bch_data_type t)
{
switch (t) {
case BCH_DATA_btree:
@@ -836,9 +793,10 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
case BCH_DATA_cached:
return ALLOC_cached;
case BCH_DATA_stripe:
+ case BCH_DATA_parity:
return ALLOC_stripe;
default:
- BUG();
+ return -1;
}
}
@@ -889,7 +847,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (bp.v->bucket_gen != a->gen)
continue;
- sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+ int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
+ if (alloc_counter < 0)
+ continue;
+
+ sectors[alloc_counter] += bp.v->bucket_len;
};
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -901,9 +863,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
goto err;
}
- /* Cached pointers don't have backpointers: */
-
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+ sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
@@ -912,6 +873,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
}
if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+ sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
-BCH_ERR_transaction_restart_nested;
@@ -919,7 +881,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
}
if (!sectors[ALLOC_dirty] &&
- !sectors[ALLOC_stripe])
+ !sectors[ALLOC_stripe] &&
+ !sectors[ALLOC_cached])
__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
else
__set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
@@ -1206,11 +1169,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k, ({
- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
check_one_backpointer(trans, start, end, k, &last_flushed);
}));
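
The progress indicator removed here moves into progress.c as bch2_progress_init()/bch2_progress_update_iter(); the mechanism is a count of nodes seen against an estimated total, with output throttled to one status line per interval. A userspace sketch of that throttling, with the clock injected explicitly so the output is deterministic (not the kernel code):

#include <stdio.h>
#include <time.h>

struct progress {
	time_t next_print;
	time_t interval;
	unsigned long long seen, total;
};

static void progress_init(struct progress *p, unsigned long long total,
			  time_t interval, time_t now)
{
	p->interval   = interval;
	p->next_print = now + interval;
	p->seen       = 0;
	p->total      = total;
}

static void progress_update(struct progress *p, time_t now, const char *msg)
{
	p->seen++;
	if (now < p->next_print)
		return;
	p->next_print = now + p->interval;
	printf("%s: %llu%%, done %llu/%llu\n", msg,
	       p->total ? p->seen * 100 / p->total : 0, p->seen, p->total);
}

int main(void)
{
	struct progress p;

	/* simulate a pass of 100 units, one per "second", printing every 10 */
	progress_init(&p, 100, 10, 0);
	for (time_t now = 1; now <= 100; now++)
		progress_update(&p, now, "example_pass");
	return 0;
}
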
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 060dad1521ee..16575dbc5736 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#ifndef _BCACHEFS_BACKPOINTERS_H
+#define _BCACHEFS_BACKPOINTERS_H
#include "btree_cache.h"
#include "btree_iter.h"
@@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
return BCH_DATA_btree;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ if (p.has_ec)
+ return BCH_DATA_stripe;
+ if (p.ptr.cached)
+ return BCH_DATA_cached;
+ else
+ return BCH_DATA_user;
case KEY_TYPE_stripe: {
const struct bch_extent_ptr *ptr = &entry->ptr;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
@@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
struct bkey_i_backpointer *bp)
{
bkey_backpointer_init(&bp->k_i);
- bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+ bp->k.p.inode = p.ptr.dev;
+
+ if (k.k->type != KEY_TYPE_stripe)
+ bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
+ else {
+ /*
+ * Put stripe backpointers where they won't collide with the
+ * extent backpointers within the stripe:
+ */
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
+ }
+
bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 161cf2f05d2a..f52311017aee 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -203,6 +203,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
+#include <linux/unicode.h>
#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
@@ -444,6 +445,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_sort) \
x(btree_node_read) \
x(btree_node_read_done) \
+ x(btree_node_write) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
@@ -456,6 +458,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
+ x(blocked_journal_max_open) \
x(blocked_key_cache_flush) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
@@ -533,6 +536,7 @@ struct bch_dev {
*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
+ unsigned long write_errors_start;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@@ -623,7 +627,8 @@ struct bch_dev {
x(topology_error) \
x(errors_fixed) \
x(errors_not_fixed) \
- x(no_invalid_checks)
+ x(no_invalid_checks) \
+ x(discard_mount_opt_set) \
enum bch_fs_flags {
#define x(n) BCH_FS_##n,
@@ -687,7 +692,8 @@ struct btree_trans_buf {
x(gc_gens) \
x(snapshot_delete_pagecache) \
x(sysfs) \
- x(btree_write_buffer)
+ x(btree_write_buffer) \
+ x(btree_node_scrub)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -696,6 +702,8 @@ enum bch_write_ref {
BCH_WRITE_REF_NR,
};
+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
+
struct bch_fs {
struct closure cl;
@@ -780,6 +788,9 @@ struct bch_fs {
u64 btrees_lost_data;
} sb;
+#ifdef CONFIG_UNICODE
+ struct unicode_map *cf_encoding;
+#endif
struct bch_sb_handle disk_sb;
@@ -969,7 +980,6 @@ struct bch_fs {
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
- struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
@@ -993,15 +1003,11 @@ struct bch_fs {
wait_queue_head_t copygc_running_wq;
/* STRIPES: */
- GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
spinlock_t ec_stripes_new_lock;
- ec_stripes_heap ec_stripes_heap;
- struct mutex ec_stripes_heap_lock;
-
/* ERASURE CODING */
struct list_head ec_stripe_head_list;
struct mutex ec_stripe_head_lock;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f70f0108401f..e96d87767020 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -686,7 +686,12 @@ struct bch_sb_field_ext {
x(inode_depth, BCH_VERSION(1, 17)) \
x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20))
+ x(directory_size, BCH_VERSION(1, 20)) \
+ x(cached_backpointers, BCH_VERSION(1, 21)) \
+ x(stripe_backpointers, BCH_VERSION(1, 22)) \
+ x(stripe_lru, BCH_VERSION(1, 23)) \
+ x(casefolding, BCH_VERSION(1, 24)) \
+ x(extent_flags, BCH_VERSION(1, 25))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -837,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -855,6 +861,8 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -908,7 +916,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18) \
- x(incompat_version_field, 19)
+ x(incompat_version_field, 19) \
+ x(casefolding, 20)
#define BCH_SB_FEATURES_ALWAYS \
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
@@ -922,7 +931,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
BIT_ULL(BCH_FEATURE_new_siphash)| \
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush))
+ BIT_ULL(BCH_FEATURE_journal_no_flush)| \
+ BIT_ULL(BCH_FEATURE_incompat_version_field))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 3c23bdf788ce..52594e925eb7 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -215,6 +216,10 @@ struct bch_ioctl_data {
union {
struct {
__u32 dev;
+ __u32 data_types;
+ } scrub;
+ struct {
+ __u32 dev;
__u32 pad;
} migrate;
struct {
@@ -229,6 +234,11 @@ enum bch_data_event {
BCH_DATA_EVENT_NR = 1,
};
+enum data_progress_data_type_special {
+ DATA_PROGRESS_DATA_TYPE_phys = 254,
+ DATA_PROGRESS_DATA_TYPE_done = 255,
+};
+
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress {
__u64 sectors_done;
__u64 sectors_total;
+ __u64 sectors_error_corrected;
+ __u64 sectors_error_uncorrected;
} __packed __aligned(8);
+enum bch_ioctl_data_event_ret {
+ BCH_IOCTL_DATA_EVENT_RET_done = 1,
+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
+};
+
struct bch_ioctl_data_event {
__u8 type;
- __u8 pad[7];
+ __u8 ret;
+ __u8 pad[6];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting {
struct bkey_i_accounting accounting[];
};
+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
+
+struct bch_ioctl_query_counters {
+ __u16 nr;
+ __u16 flags;
+ __u32 pad;
+ __u64 d[];
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 1ec1f90e0eb3..54666027aa85 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
btree_node_write_in_flight(b));
btree_node_data_free(bc, b);
+ cond_resched();
}
BUG_ON(!bch2_journal_error(&c->journal) &&
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dd1d9b74076e..ff681e733598 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -27,6 +27,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "progress.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "recovery.h"
@@ -656,7 +657,9 @@ fsck_err:
return ret;
}
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
+static int bch2_gc_btree(struct btree_trans *trans,
+ struct progress_indicator_state *progress,
+ enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
@@ -673,6 +676,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
}));
@@ -717,22 +721,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- enum btree_id ids[BTREE_ID_NR];
struct printbuf buf = PRINTBUF;
- unsigned i;
int ret = 0;
- for (i = 0; i < BTREE_ID_NR; i++)
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, ~0ULL);
+
+ enum btree_id ids[BTREE_ID_NR];
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = bch2_gc_btree(trans, btree, true);
+ ret = bch2_gc_btree(trans, &progress, btree, true);
}
printbuf_exit(&buf);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 756736f9243d..2ba33ffc9795 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
@@ -1328,6 +1329,7 @@ static void btree_node_read_work(struct work_struct *work)
bch_info(c, "retrying read");
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
rb->have_ioref = ca != NULL;
+ rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1338,21 +1340,26 @@ static void btree_node_read_work(struct work_struct *work)
} else {
bio->bi_status = BLK_STS_REMOVED;
}
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
+
+ if (ca && bio->bi_status)
+ bch_err_dev_ratelimited(ca,
+ "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
- bch2_mark_io_failure(&failed, &rb->pick);
+ bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
- &failed, &rb->pick) > 0;
+ &failed, &rb->pick, -1) > 0;
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
@@ -1400,12 +1407,11 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
queue_work(c->btree_read_complete_wq, &rb->work);
}
@@ -1697,7 +1703,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
return;
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick);
+ NULL, &pick, -1);
if (ret <= 0) {
struct printbuf buf = PRINTBUF;
@@ -1811,6 +1817,190 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
+struct btree_node_scrub {
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ void *buf;
+ bool used_mempool;
+ unsigned written;
+
+ enum btree_id btree;
+ unsigned level;
+ struct bkey_buf key;
+ __le64 seq;
+
+ struct work_struct work;
+ struct bio bio;
+};
+
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
+ struct printbuf *err)
+{
+ unsigned written = 0;
+
+ if (le64_to_cpu(data->magic) != bset_magic(c)) {
+ prt_printf(err, "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(data->magic));
+ return false;
+ }
+
+ while (written < (ptr_written ?: btree_sectors(c))) {
+ struct btree_node_entry *bne;
+ struct bset *i;
+ bool first = !written;
+
+ if (first) {
+ bne = NULL;
+ i = &data->keys;
+ } else {
+ bne = (void *) data + (written << 9);
+ i = &bne->keys;
+
+ if (!ptr_written && i->seq != data->keys.seq)
+ break;
+ }
+
+ struct nonce nonce = btree_nonce(i, written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
+ if (bch2_crc_cmp(data->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(data, c->block_bits);
+ } else {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ if (bch2_crc_cmp(bne->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return true;
+}
+
+static void btree_node_scrub_work(struct work_struct *work)
+{
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
+ struct bch_fs *c = scrub->c;
+ struct printbuf err = PRINTBUF;
+
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
+ bkey_i_to_s_c(scrub->key.k));
+ prt_newline(&err);
+
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, scrub->btree,
+ scrub->key.k->k.p, 0, scrub->level - 1, 0);
+
+ struct btree *b;
+ int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
+ if (ret)
+ goto err;
+
+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
+ bch_err(c, "error validating btree node during scrub on %s at btree %s",
+ scrub->ca->name, err.buf);
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_begin(trans);
+ bch2_trans_put(trans);
+ }
+
+ printbuf_exit(&err);
+ bch2_bkey_buf_exit(&scrub->key, c);
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+ percpu_ref_put(&scrub->ca->io_ref);
+ kfree(scrub);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+}
+
+static void btree_node_scrub_endio(struct bio *bio)
+{
+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
+
+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
+}
+
+int bch2_btree_node_scrub(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k, unsigned dev)
+{
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
+ return -BCH_ERR_erofs_no_writes;
+
+ struct extent_ptr_decoded pick;
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+ if (ret <= 0)
+ goto err;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ if (!ca) {
+ ret = -BCH_ERR_device_offline;
+ goto err;
+ }
+
+ bool used_mempool = false;
+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
+
+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
+
+ struct btree_node_scrub *scrub =
+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
+ if (!scrub) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ scrub->c = c;
+ scrub->ca = ca;
+ scrub->buf = buf;
+ scrub->used_mempool = used_mempool;
+ scrub->written = btree_ptr_sectors_written(k);
+
+ scrub->btree = btree;
+ scrub->level = level;
+ bch2_bkey_buf_init(&scrub->key);
+ bch2_bkey_buf_reassemble(&scrub->key, c, k);
+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
+
+ INIT_WORK(&scrub->work, btree_node_scrub_work);
+
+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
+ scrub->bio.bi_end_io = btree_node_scrub_endio;
+ submit_bio(&scrub->bio);
+ return 0;
+err_free:
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+ percpu_ref_put(&ca->io_ref);
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+ return ret;
+}
+
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
@@ -1831,7 +2021,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new;
@@ -1839,6 +2029,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
bch2_btree_complete_write(c, b, w);
+ if (start_time)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
+
old = READ_ONCE(b->flags);
do {
new = old;
@@ -1869,7 +2062,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -1877,7 +2070,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, start_time);
six_unlock_read(&b->c.lock);
}
@@ -1887,6 +2080,7 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ u64 start_time = wbio->start_time;
int ret = 0;
btree_bounce_free(c,
@@ -1919,12 +2113,18 @@ static void btree_node_write_work(struct work_struct *work)
}
out:
bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
+ btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+
+ if (!bch2_err_matches(ret, EROFS)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
goto out;
}
@@ -1937,16 +2137,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
- unsigned long flags;
- if (wbio->have_ioref)
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
- if (!ca ||
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("btree")) {
+ if (ca && bio->bi_status) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bio->bi_status) {
+ unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
@@ -2023,6 +2228,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ u64 start_time = local_clock();
int ret;
if (flags & BTREE_WRITE_ALREADY_STARTED)
@@ -2231,6 +2437,7 @@ do_write:
wbio->data = data;
wbio->data_bytes = bytes;
wbio->sector_offset = b->written;
+ wbio->start_time = start_time;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@@ -2258,7 +2465,7 @@ err:
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, 0);
}
/*
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 6f9e4a6dacf7..dbf76d22c660 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -52,6 +52,7 @@ struct btree_write_bio {
void *data;
unsigned data_bytes;
unsigned sector_offset;
+ u64 start_time;
struct bch_write_bio wbio;
};
@@ -132,6 +133,9 @@ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, unsigned);
+
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
enum btree_write_flags {
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index e32fce4fd258..7542c6f9c88e 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -562,20 +562,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_peek(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->key.k.p;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index b96157f3dc9c..8823eec6b284 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -335,13 +335,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra
}
__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
trans->restarted = err;
trans->last_restarted_ip = ip;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+{
+ btree_trans_restart_foreign_task(trans, err, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index caef65adeae4..94eb2b73a843 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -91,10 +91,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = i->trans->locking_wait.task;
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", task ?task->pid : 0);
+ prt_printf(out, "%u ", task ? task->pid : 0);
}
prt_newline(out);
}
@@ -172,7 +172,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
trace_would_deadlock(g, i->trans);
- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ return btree_trans_restart_foreign_task(i->trans,
+ BCH_ERR_transaction_restart_would_deadlock,
+ _THIS_IP_);
} else {
i->trans->lock_must_abort = true;
wake_up_process(i->trans->locking_wait.task);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index a7f06deee13c..678161321e42 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -166,11 +166,17 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, bn, PAGE_SIZE);
+ u64 submit_time = local_clock();
submit_bio_wait(bio);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status)))
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status));
return;
+ }
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
@@ -264,7 +270,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref);
closure_put(w->cl);
kfree(w);
return 0;
@@ -283,29 +289,28 @@ static int read_btree_nodes(struct find_btree_nodes *f)
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
percpu_ref_put(&ca->io_ref);
ret = -ENOMEM;
goto err;
}
- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ percpu_ref_get(&ca->io_ref);
+ wake_up_process(t);
}
err:
closure_sync(&cl);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index c4f524b2ca9a..7d7e52ddde02 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -164,6 +164,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+ kmsan_check_memory(insert, bkey_bytes(&insert->k));
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -336,6 +337,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != path->cached);
BUG_ON(i->level != path->level);
BUG_ON(i->btree_id != path->btree_id);
+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
@@ -517,69 +519,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
}
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- unsigned *btree_id_updates_start)
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- bool trans_trigger_run;
+ unsigned sort_id_start = 0;
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
+ while (sort_id_start < trans->nr_updates) {
+ unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
+ bool trans_trigger_run;
- for (unsigned i = *btree_id_updates_start;
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
- i++) {
- if (trans->updates[i].btree_id < btree_id) {
- *btree_id_updates_start = i;
- continue;
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being
+ * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
+ * references before they are re-added.
+ *
+ * Running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = sort_id_start;
+ i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
+ i++) {
+ if (trans->updates[i].sort_order < sort_id) {
+ sort_id_start = i;
+ continue;
+ }
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
}
+ } while (trans_trigger_run);
- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- i->btree_id == btree_id &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned btree_id = 0, btree_id_updates_start = 0;
- int ret = 0;
-
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- if (btree_id == BTREE_ID_alloc)
- continue;
-
- ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
- if (ret)
- return ret;
+ sort_id_start = i;
}
- btree_id_updates_start = 0;
- ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
- if (ret)
- return ret;
-
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
@@ -903,18 +881,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_journal_res_get_blocked:
+ if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
/*
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
@@ -922,13 +889,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
- break;
+ goto out;
}
ret = drop_locks_do(trans,
bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_CHECK));
+ goto out;
+ }
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_btree_insert_need_journal_reclaim:
bch2_trans_unlock(trans);
@@ -950,7 +930,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(ret >= 0);
break;
}
-
+out:
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index a09cbe9cd94f..77578da2d23f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -423,6 +423,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
struct btree_insert_entry {
unsigned flags;
+ u8 sort_order;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level:4;
@@ -853,6 +854,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree)
return BIT_ULL(btree) & mask;
}
+static inline u8 btree_trigger_order(enum btree_id btree)
+{
+ switch (btree) {
+ case BTREE_ID_alloc:
+ return U8_MAX;
+ case BTREE_ID_stripes:
+ return U8_MAX - 1;
+ default:
+ return btree;
+ }
+}
+
struct btree_root {
struct btree *b;
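
The new sort_order field replaces the old per-btree-id trigger loop: updates are sorted by btree_trigger_order(), so alloc and stripes updates land at the end of the list and their triggers run last, and btree_insert_entry_cmp() in btree_update.c (next hunk) now compares this field first. A minimal userspace sketch of that ordering, with invented btree ids standing in for the real enum:

#include <stdio.h>
#include <stdlib.h>

/* Invented ids for illustration only; mirrors the shape of btree_trigger_order() above */
enum { ID_extents, ID_inodes, ID_dirents, ID_alloc, ID_stripes };

static unsigned trigger_order(unsigned btree)
{
	switch (btree) {
	case ID_alloc:		return 255;
	case ID_stripes:	return 254;
	default:		return btree;
	}
}

static int cmp_by_trigger_order(const void *l, const void *r)
{
	return (int) trigger_order(*(const unsigned *) l) -
	       (int) trigger_order(*(const unsigned *) r);
}

int main(void)
{
	unsigned updates[] = { ID_alloc, ID_extents, ID_stripes, ID_dirents };

	qsort(updates, sizeof(updates) / sizeof(updates[0]),
	      sizeof(updates[0]), cmp_by_trigger_order);

	for (size_t i = 0; i < sizeof(updates) / sizeof(updates[0]); i++)
		printf("%u ", updates[i]);	/* extents, dirents, stripes, alloc */
	printf("\n");
	return 0;
}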
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 13d794f201a5..bd2eb42edb24 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -17,7 +17,7 @@
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
- return cmp_int(l->btree_id, r->btree_id) ?:
+ return cmp_int(l->sort_order, r->sort_order) ?:
cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
@@ -397,6 +397,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
n = (struct btree_insert_entry) {
.flags = flags,
+ .sort_order = btree_trigger_order(path->btree_id),
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
@@ -511,6 +512,8 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 47d8690f01bf..d2e1c04353f6 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -133,6 +133,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index e4e7c804625e..67f1e3202835 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -649,6 +649,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
return 0;
}
+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
+{
+ struct btree_node *b_data = READ_ONCE(b->data);
+
+ return (b_data ? b_data->keys.seq : 0) == seq;
+}
+
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
@@ -677,17 +685,9 @@ static void btree_update_nodes_written(struct btree_update *as)
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- __le64 seq;
-
b = as->old_nodes[i];
- bch2_trans_begin(trans);
- btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- seq = b->data ? b->data->keys.seq : 0;
- six_unlock_read(&b->c.lock);
- bch2_trans_unlock_long(trans);
-
- if (seq == as->old_nodes_seq[i])
+ if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
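
btree_node_seq_matches() lets the post-write path check whether an old node still belongs to this update without taking locks; if the node was freed and reused, the stored sequence number simply will not match, and reading stale memory is harmless because the compare just fails. A small sketch of the idea under invented types:

#include <stdio.h>

struct node { unsigned long long seq; };

static int seq_matches(const struct node *n, unsigned long long saved_seq)
{
	return n && n->seq == saved_seq;
}

int main(void)
{
	struct node n = { .seq = 42 };
	unsigned long long saved = n.seq;

	n.seq = 43;	/* node got reused for something else */
	printf("still ours: %d\n", seq_matches(&n, saved));	/* 0: skip waiting on it */
	return 0;
}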
@@ -2126,6 +2126,31 @@ err_free_update:
goto out;
}
+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b)
+{
+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ /* has node been freed? */
+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ ret = -BCH_ERR_btree_node_dying;
+ goto err;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
@@ -2191,66 +2216,78 @@ err:
goto out;
}
-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
- struct async_btree_rewrite *a)
+static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_i *k, unsigned flags)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
- a->btree_id, a->key.k->k.p,
- BTREE_MAX_DEPTH, a->level, 0);
+ btree, k->k.p,
+ BTREE_MAX_DEPTH, level, 0);
struct btree *b = bch2_btree_iter_peek_node(&iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ ? bch2_btree_node_rewrite(trans, &iter, b, flags)
: -ENOENT;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
-#if 0
- /* Tracepoint... */
- if (!ret || ret == -ENOENT) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bpos pos, unsigned flags)
+{
+ BUG_ON(!level);
- if (!ret) {
- prt_printf(&buf, "rewrite node:\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- } else {
- prt_printf(&buf, "node to rewrite not found:\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- prt_printf(&buf, "\n got: ");
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null)");
- }
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-#endif
-out:
+ /* Traverse one depth lower to get a pointer to the node itself: */
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+ struct btree *b, unsigned flags)
+{
+ struct btree_iter iter;
+ int ret = get_iter_to_node(trans, &iter, b);
+ if (ret)
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bkey_buf key;
+};
+
static void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+ a->btree_id, a->level, a->key.k, 0));
if (ret != -ENOENT)
bch_err_fn_ratelimited(c, ret);
@@ -2494,30 +2531,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
unsigned commit_flags, bool skip_triggers)
{
struct btree_iter iter;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
+ int ret = get_iter_to_node(trans, &iter, b);
if (ret)
- goto out;
-
- /* has node been freed? */
- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
-
- BUG_ON(!btree_node_hashed(b));
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
-out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 26d646e1275c..be71cd73b864 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -169,7 +169,14 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
struct btree *, unsigned);
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bpos, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+ struct btree *, unsigned);
+
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
struct btree *, struct bkey_i *,
unsigned, bool);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 345b117a4a4a..e56ef623ebc1 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -590,11 +590,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (ret)
goto err;
- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
+ if (ret)
+ goto err;
}
if (flags & BTREE_TRIGGER_gc) {
@@ -674,10 +672,10 @@ err:
return -BCH_ERR_ENOMEM_mark_stripe_ptr;
}
- mutex_lock(&c->ec_stripes_heap_lock);
+ gc_stripe_lock(m);
if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
+ gc_stripe_unlock(m);
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
@@ -693,7 +691,7 @@ err:
.type = BCH_DISK_ACCOUNTING_replicas,
};
memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
- mutex_unlock(&c->ec_stripes_heap_lock);
+ gc_stripe_unlock(m);
acc.replicas.data_type = data_type;
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
@@ -726,9 +724,7 @@ static int __trigger_extent(struct btree_trans *trans,
.replicas.nr_required = 1,
};
- struct disk_accounting_pos acct_compression_key = {
- .type = BCH_DISK_ACCOUNTING_compression,
- };
+ unsigned cur_compression_type = 0;
u64 compression_acct[3] = { 1, 0, 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -762,13 +758,13 @@ static int __trigger_extent(struct btree_trans *trans,
acc_replicas_key.replicas.nr_required = 0;
}
- if (acct_compression_key.compression.type &&
- acct_compression_key.compression.type != p.crc.compression_type) {
+ if (cur_compression_type &&
+ cur_compression_type != p.crc.compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
@@ -777,7 +773,7 @@ static int __trigger_extent(struct btree_trans *trans,
compression_acct[2] = 0;
}
- acct_compression_key.compression.type = p.crc.compression_type;
+ cur_compression_type = p.crc.compression_type;
if (p.crc.compression_type) {
compression_acct[1] += p.crc.uncompressed_size;
compression_acct[2] += p.crc.compressed_size;
@@ -791,45 +787,34 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- struct disk_accounting_pos acc_snapshot_key = {
- .type = BCH_DISK_ACCOUNTING_snapshot,
- .snapshot.id = k.k->p.snapshot,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}
- if (acct_compression_key.compression.type) {
+ if (cur_compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
}
if (level) {
- struct disk_accounting_pos acc_btree_key = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = btree_id,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct disk_accounting_pos acc_inum_key = {
- .type = BCH_DISK_ACCOUNTING_inum,
- .inum.inum = k.k->p.inode,
- };
+
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
*replicas_sectors,
};
- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
return ret;
}
@@ -878,15 +863,15 @@ int bch2_trigger_extent(struct btree_trans *trans,
}
int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta -= s;
+ need_rebalance_sectors_delta[0] -= s;
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta += s;
+ need_rebalance_sectors_delta[0] += s;
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
@@ -895,12 +880,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
return ret;
}
- if (need_rebalance_sectors_delta) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
- };
- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
- flags & BTREE_TRIGGER_gc);
+ if (need_rebalance_sectors_delta[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ need_rebalance_sectors_delta, rebalance_work);
if (ret)
return ret;
}
@@ -916,17 +898,13 @@ static int __trigger_reservation(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors = k.k->size;
+ s64 sectors[1] = { k.k->size };
if (flags & BTREE_TRIGGER_overwrite)
- sectors = -sectors;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
- };
+ sectors[0] = -sectors[0];
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
}
return 0;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index a9acdd6c0c86..c5363256e363 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -39,33 +39,6 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- * while (xchg(&b->lock, 1) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR 0
-#else
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
- ulong ulong;
- u8 byte;
-};
-
static inline void bucket_unlock(struct bucket *b)
{
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
@@ -167,9 +140,7 @@ static inline int gen_cmp(u8 a, u8 b)
static inline int gen_after(u8 a, u8 b)
{
- int r = gen_cmp(a, b);
-
- return r > 0 ? r : 0;
+ return max(0, gen_cmp(a, b));
}
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 7174047b8e92..900b8680c8b5 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -7,6 +7,33 @@
#define BUCKET_JOURNAL_SEQ_BITS 16
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
+
struct bucket {
u8 lock;
u8 gen_valid:1;
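
A quick userspace check of the constraint the BUILD_BUG_ON in bucket_unlock() enforces, assuming the same BUCKET_LOCK_BITNR choices as above: whichever bit is chosen, it has to be visible in the first byte of the word, which is where the one-byte lock field lives:

#include <assert.h>
#include <stdio.h>

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define BUCKET_LOCK_BITNR 0
#else
#define BUCKET_LOCK_BITNR (sizeof(unsigned long) * 8 - 1)
#endif

union ulong_byte_assert {
	unsigned long	ulong;
	unsigned char	byte;
};

int main(void)
{
	union ulong_byte_assert u = { .ulong = 1UL << BUCKET_LOCK_BITNR };

	/* the lock bit must show up in byte 0, where the u8 lock field sits */
	assert(u.byte);
	printf("lock bit value seen in first byte: 0x%x\n", u.byte);
	return 0;
}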
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 46e9e32105a9..57d55b3ddc71 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -11,6 +11,7 @@
#include "move.h"
#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-counters.h"
#include "super-io.h"
#include "thread_with_file.h"
@@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg)
struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
- ctx->stats.data_type = U8_MAX;
+ if (ctx->thr.ret == -BCH_ERR_device_offline)
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+ else {
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
+ }
return 0;
}
@@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_total = bch2_fs_usage_read_short(c).used,
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .ret = ctx->stats.ret,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
};
+ if (ctx->arg.op == BCH_DATA_OP_scrub) {
+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+ if (ca) {
+ struct bch_dev_usage u;
+ bch2_dev_usage_read_fast(ca, &u);
+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+ if (ctx->arg.scrub.data_types & BIT(i))
+ e.p.sectors_total += u.d[i].sectors;
+ bch2_dev_put(ca);
+ }
+ } else {
+ e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+ }
+
if (len < sizeof(e))
return -EINVAL;
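
For scrub jobs, sectors_total is now the sum of the device's per-data-type sector counts for the types selected by the ioctl. A sketch of that totalling step with made-up counts and an assumed two-bit selection mask:

#include <stdio.h>

int main(void)
{
	unsigned long long sectors[4] = { 0, 1000, 5000, 200 };	/* per data type, invented */
	unsigned data_types = (1U << 1) | (1U << 2);		/* say, btree + user */
	unsigned long long total = 0;

	for (unsigned i = 1; i < 4; i++)
		if (data_types & (1U << i))
			total += sectors[i];

	printf("sectors_total = %llu\n", total);	/* 6000 */
	return 0;
}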
@@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
case BCH_IOCTL_QUERY_ACCOUNTING:
return bch2_ioctl_query_accounting(c, arg);
+ case BCH_IOCTL_QUERY_COUNTERS:
+ return bch2_ioctl_query_counters(c, arg);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 23a383577d4c..3726689093e3 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -466,7 +466,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, ")");
WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return -BCH_ERR_recompute_checksum;
}
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
@@ -693,6 +693,14 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
return 0;
}
+#if 0
+
+/*
+ * This seems to be duplicating code in cmd_remove_passphrase() in
+ * bcachefs-tools, but we might want to switch userspace to use this - and
+ * perhaps add an ioctl for calling this at runtime, so we can take the
+ * passphrase off of a mounted filesystem (which has come up).
+ */
int bch2_disable_encryption(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
@@ -725,6 +733,10 @@ out:
return ret;
}
+/*
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
+ * should be
+ */
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
struct bch_encrypted_key key;
@@ -781,6 +793,7 @@ err:
memzero_explicit(&key, sizeof(key));
return ret;
}
+#endif
void bch2_fs_encryption_exit(struct bch_fs *c)
{
@@ -788,8 +801,6 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
crypto_free_shash(c->poly1305);
if (c->chacha20)
crypto_free_sync_skcipher(c->chacha20);
- if (c->sha256)
- crypto_free_shash(c->sha256);
}
int bch2_fs_encryption_init(struct bch_fs *c)
@@ -798,14 +809,6 @@ int bch2_fs_encryption_init(struct bch_fs *c)
struct bch_key key;
int ret = 0;
- c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->sha256);
- if (ret) {
- c->sha256 = NULL;
- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
- goto out;
- }
-
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 43b9d71f2f2b..4ac251c8fcd8 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -103,8 +103,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
+#if 0
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
+#endif
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 114bf2f3879f..85fc90342492 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
void *workspace;
- int ret;
+ int ret = 0, ret2;
enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
mempool_t *workspace_pool = &c->compress_workspace[opt];
@@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
else
ret = -BCH_ERR_compression_workspace_not_initialized;
if (ret)
- goto out;
+ goto err;
}
src_data = bio_map_or_bounce(c, src, READ);
@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
switch (crc.compression_type) {
case BCH_COMPRESSION_TYPE_lz4_old:
case BCH_COMPRESSION_TYPE_lz4:
- ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret != dst_len)
- goto err;
+ ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
+ if (ret2 != dst_len)
+ ret = -BCH_ERR_decompress_lz4;
break;
case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
- ret = zlib_inflate(&strm, Z_FINISH);
+ ret2 = zlib_inflate(&strm, Z_FINISH);
mempool_free(workspace, workspace_pool);
- if (ret != Z_STREAM_END)
- goto err;
+ if (ret2 != Z_STREAM_END)
+ ret = -BCH_ERR_decompress_gzip;
break;
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
size_t real_src_len = le32_to_cpup(src_data.b);
- if (real_src_len > src_len - 4)
+ if (real_src_len > src_len - 4) {
+ ret = -BCH_ERR_decompress_zstd_src_len_bad;
goto err;
+ }
workspace = mempool_alloc(workspace_pool, GFP_NOFS);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = zstd_decompress_dctx(ctx,
+ ret2 = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
mempool_free(workspace, workspace_pool);
- if (ret != dst_len)
- goto err;
+ if (ret2 != dst_len)
+ ret = -BCH_ERR_decompress_zstd;
break;
}
default:
BUG();
}
- ret = 0;
+err:
fsck_err:
-out:
bio_unmap_or_unbounce(c, src_data);
return ret;
-err:
- ret = -EIO;
- goto out;
}
int bch2_bio_uncompress_inplace(struct bch_write_op *op,
@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc->compressed_size << 9 > c->opts.encoded_extent_max) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error rewriting existing data: extent too big");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EIO;
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
+ bch2_write_op_error(op, op->pos.offset,
+ "extent too big to decompress (%u > %u)",
+ crc->uncompressed_size << 9, c->opts.encoded_extent_max);
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
}
data = __bounce_alloc(c, dst_len, WRITE);
- if (__bio_uncompress(c, bio, data.b, *crc)) {
- if (!c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error rewriting existing data: decompression error");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- ret = -EIO;
+ ret = __bio_uncompress(c, bio, data.b, *crc);
+
+ if (c->opts.no_data_io)
+ ret = 0;
+
+ if (ret) {
+ bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
goto err;
}
@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -EIO;
+ return -BCH_ERR_decompress_exceeded_max_encoded_extent;
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 642fbc60ecab..0ec273daccb7 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -20,6 +20,8 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/ioprio.h>
+
static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- if (!bch2_dev_tryget(c, ptr->dev)) {
+ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
@@ -91,7 +93,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
return true;
}
-static noinline void trace_move_extent_finish2(struct data_update *u,
+static noinline void trace_io_move_finish2(struct data_update *u,
struct bkey_i *new,
struct bkey_i *insert)
{
@@ -111,11 +113,11 @@ static noinline void trace_move_extent_finish2(struct data_update *u,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
- trace_move_extent_finish(c, buf.buf);
+ trace_io_move_finish(c, buf.buf);
printbuf_exit(&buf);
}
-static void trace_move_extent_fail2(struct data_update *m,
+static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
@@ -126,7 +128,7 @@ static void trace_move_extent_fail2(struct data_update *m,
struct printbuf buf = PRINTBUF;
unsigned rewrites_found = 0;
- if (!trace_move_extent_fail_enabled())
+ if (!trace_io_move_fail_enabled())
return;
prt_str(&buf, msg);
@@ -166,7 +168,7 @@ static void trace_move_extent_fail2(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
}
- trace_move_extent_fail(c, buf.buf);
+ trace_io_move_fail(c, buf.buf);
printbuf_exit(&buf);
}
@@ -214,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
new = bkey_i_to_extent(bch2_keylist_front(keys));
if (!bch2_extents_match(k, old)) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
NULL, "no match:");
goto nowork;
}
@@ -254,7 +256,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (m->data_opts.rewrite_ptrs &&
!rewrites_found &&
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
goto nowork;
}
@@ -271,7 +273,7 @@ restart_drop_conflicting_replicas:
}
if (!bkey_val_u64s(&new->k)) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
goto nowork;
}
@@ -352,7 +354,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
- ret = -EIO;
+ ret = -BCH_ERR_invalid_bkey;
goto out;
}
@@ -385,9 +387,9 @@ restart_drop_extra_replicas:
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
- if (trace_move_extent_finish_enabled())
- trace_move_extent_finish2(m, &new->k_i, insert);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
+ if (trace_io_move_finish_enabled())
+ trace_io_move_finish2(m, &new->k_i, insert);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -409,7 +411,7 @@ nowork:
&m->stats->sectors_raced);
}
- count_event(c, move_extent_fail);
+ count_event(c, io_move_fail);
bch2_btree_iter_advance(&iter);
goto next;
@@ -427,14 +429,17 @@ int bch2_data_update_index_update(struct bch_write_op *op)
return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}
-void bch2_data_update_read_done(struct data_update *m,
- struct bch_extent_crc_unpacked crc)
+void bch2_data_update_read_done(struct data_update *m)
{
+ m->read_done = true;
+
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
- m->op.crc = crc;
- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+ m->op.crc = m->rbio.pick.crc;
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+
+ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
}
@@ -444,31 +449,34 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+ kfree(update->bvecs);
+ update->bvecs = NULL;
+
if (c->opts.nocow_enabled)
bkey_nocow_unlock(c, k);
bkey_put_dev_refs(c, k);
- bch2_bkey_buf_exit(&update->k, c);
bch2_disk_reservation_put(c, &update->op.res);
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+ bch2_bkey_buf_exit(&update->k, c);
}
-static void bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
+static int bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
{
struct bch_fs *c = update->op.c;
- struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
- while (bio_sectors(bio)) {
- unsigned sectors = bio_sectors(bio);
+ while (bpos_lt(update->op.pos, update->k.k->k.p)) {
+ unsigned sectors = update->k.k->k.p.offset -
+ update->op.pos.offset;
bch2_trans_begin(trans);
@@ -504,7 +512,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch_err_fn_ratelimited(c, ret);
if (ret)
- return;
+ break;
sectors = min(sectors, wp->sectors_free);
@@ -514,7 +522,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
bch2_alloc_sectors_done(c, wp);
- bio_advance(bio, sectors << 9);
update->op.pos.offset += sectors;
extent_for_each_ptr(extent_i_to_s(e), ptr)
@@ -533,13 +540,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_trans_unlock(trans);
closure_sync(&cl);
}
+
+ return ret;
}
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- printbuf_tabstop_push(out, 20);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
prt_str_indented(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
@@ -574,6 +584,17 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}
+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+ prt_printf(out, "read_done:\t\%u\n", m->read_done);
+ bch2_write_op_to_text(out, &m->op);
+ printbuf_indent_sub(out, 2);
+}
+
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -617,12 +638,85 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ /* write path might have to decompress data: */
+ unsigned buf_bytes = 0;
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+ if (!m->bvecs)
+ return -ENOMEM;
+
+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
+
+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+ kfree(m->bvecs);
+ m->bvecs = NULL;
+ return -ENOMEM;
+ }
+
+ rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+ m->rbio.data_update = true;
+ m->rbio.bio.bi_iter.bi_size = buf_bytes;
+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+ return 0;
+}
+
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
+{
+ if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+ unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+ return -BCH_ERR_data_update_done_would_block;
+
+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+ ? m->op.target
+ : 0;
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+ darray_for_each(m->op.devs_have, i)
+ __clear_bit(*i, devs.d);
+
+ rcu_read_lock();
+ unsigned nr_replicas = 0, i;
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+ struct bch_dev *ca = bch2_dev_rcu(c, i);
+
+ struct bch_dev_usage usage;
+ bch2_dev_usage_read_fast(ca, &usage);
+
+ if (!dev_buckets_free(ca, usage, m->op.watermark))
+ continue;
+
+ nr_replicas += ca->mi.durability;
+ if (nr_replicas >= m->op.nr_replicas)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nr_replicas)
+ return -BCH_ERR_data_update_done_no_rw_devs;
+ if (nr_replicas < m->op.nr_replicas)
+ return -BCH_ERR_insufficient_devices;
+ return 0;
+}
+
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
- struct bch_io_opts io_opts,
+ struct bch_io_opts *io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
@@ -640,16 +734,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* snapshots table - just skip it, we can move it later.
*/
if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
- return -BCH_ERR_data_update_done;
-
- if (!bkey_get_dev_refs(c, k))
- return -BCH_ERR_data_update_done;
-
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, k)) {
- bkey_put_dev_refs(c, k);
- return -BCH_ERR_nocow_lock_blocked;
- }
+ return -BCH_ERR_data_update_done_no_snapshot;
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -658,18 +743,18 @@ int bch2_data_update_init(struct btree_trans *trans,
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
- bch2_write_op_init(&m->op, c, io_opts);
+ bch2_write_op_init(&m->op, c, *io_opts);
m->op.pos = bkey_start_pos(k.k);
m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
- BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_MOVE|
+ m->op.flags |= BCH_WRITE_pages_stable|
+ BCH_WRITE_pages_owned|
+ BCH_WRITE_data_encoded|
+ BCH_WRITE_move|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression;
+ m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
unsigned durability_have = 0, durability_removing = 0;
@@ -707,7 +792,7 @@ int bch2_data_update_init(struct btree_trans *trans,
ptr_bit <<= 1;
}
- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
* If current extent durability is less than io_opts.data_replicas,
@@ -740,28 +825,70 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
- goto out;
+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
+ if (!ret)
+ ret = -BCH_ERR_data_update_done_no_writes_needed;
+ goto out_bkey_buf_exit;
}
+ /*
+ * Check if the allocation will succeed, to avoid getting an error later
+ * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+ * read:
+ *
+ * This guards against
+ * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+ * - Destination target full
+ * - Device(s) in destination target offline
+ * - Insufficient durability available in destination target
+ * (i.e. trying to move a durability=2 replica to a target with a
+ * single durability=2 device)
+ */
+ ret = can_write_extent(c, m);
+ if (ret)
+ goto out_bkey_buf_exit;
+
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL);
if (ret)
- goto out;
+ goto out_bkey_buf_exit;
+ }
+
+ if (!bkey_get_dev_refs(c, k)) {
+ ret = -BCH_ERR_data_update_done_no_dev_refs;
+ goto out_put_disk_res;
+ }
+
+ if (c->opts.nocow_enabled &&
+ !bkey_nocow_lock(c, ctxt, k)) {
+ ret = -BCH_ERR_nocow_lock_blocked;
+ goto out_put_dev_refs;
}
if (bkey_extent_is_unwritten(k)) {
- bch2_update_unwritten_extent(trans, m);
- goto out;
+ ret = bch2_update_unwritten_extent(trans, m) ?:
+ -BCH_ERR_data_update_done_unwritten;
+ goto out_nocow_unlock;
}
+ ret = bch2_data_update_bios_init(m, c, io_opts);
+ if (ret)
+ goto out_nocow_unlock;
+
return 0;
-out:
- bch2_data_update_exit(m);
- return ret ?: -BCH_ERR_data_update_done;
+out_nocow_unlock:
+ if (c->opts.nocow_enabled)
+ bkey_nocow_unlock(c, k);
+out_put_dev_refs:
+ bkey_put_dev_refs(c, k);
+out_put_disk_res:
+ bch2_disk_reservation_put(c, &m->op.res);
+out_bkey_buf_exit:
+ bch2_bkey_buf_exit(&m->k, c);
+ return ret;
}
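
can_write_extent() is the feasibility check referred to in the comment above: before doing the read, walk the rw devices in the destination target, skip those with no free buckets, and add up durability until the write's replica goal is met. A simplified sketch with invented device data (the error values stand in for the new data_update_done_no_rw_devs and insufficient_devices codes):

#include <stdio.h>

struct dev {
	unsigned durability;
	unsigned free_buckets;
};

static int can_write(const struct dev *devs, unsigned nr_devs, unsigned want_replicas)
{
	unsigned nr_replicas = 0;

	for (unsigned i = 0; i < nr_devs; i++) {
		if (!devs[i].free_buckets)
			continue;	/* allocation would fail here, skip */

		nr_replicas += devs[i].durability;
		if (nr_replicas >= want_replicas)
			return 0;
	}

	return !nr_replicas ? -1	/* no writable devices at all */
			    : -2;	/* some, but not enough durability */
}

int main(void)
{
	struct dev devs[] = { { 1, 100 }, { 1, 0 }, { 2, 50 } };

	printf("%d\n", can_write(devs, 3, 2));	/* 0: goal met */
	printf("%d\n", can_write(devs, 3, 4));	/* -2: insufficient */
	return 0;
}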
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index e4b50723428e..c194cbbf5b51 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -4,6 +4,7 @@
#define _BCACHEFS_DATA_UPDATE_H
#include "bkey_buf.h"
+#include "io_read.h"
#include "io_write_types.h"
struct moving_context;
@@ -15,6 +16,9 @@ struct data_update_opts {
u8 extra_replicas;
unsigned btree_insert_flags;
unsigned write_flags;
+
+ int read_dev;
+ bool scrub;
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
@@ -22,20 +26,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct data_update {
/* extent being updated: */
+ bool read_done;
enum btree_id btree_id;
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
struct bch_move_stats *stats;
+
+ struct bch_read_bio rbio;
struct bch_write_op op;
+ struct bio_vec *bvecs;
};
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
int bch2_data_update_index_update(struct bch_write_op *);
-void bch2_data_update_read_done(struct data_update *,
- struct bch_extent_crc_unpacked);
+void bch2_data_update_read_done(struct data_update *);
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
@@ -43,12 +51,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
struct bch_io_opts *,
struct data_update_opts *);
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+ struct bch_io_opts *);
+
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
- struct bch_io_opts, struct data_update_opts,
+ struct bch_io_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 55333e82d1fe..788af88f6979 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -7,6 +7,7 @@
*/
#include "bcachefs.h"
+#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -190,7 +191,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
unsigned offset = 0;
int ret;
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
prt_printf(out, "error getting device to read from: invalid device\n");
return;
}
@@ -844,8 +845,11 @@ restart:
seqmutex_unlock(&c->btree_trans_lock);
}
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
+
+static ssize_t bch2_simple_print(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos,
+ fs_to_text_fn fn)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
@@ -856,7 +860,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
i->ret = 0;
if (!i->iter) {
- btree_deadlock_to_text(&i->buf, c);
+ fn(&i->buf, c);
i->iter++;
}
@@ -869,6 +873,12 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
return ret ?: i->ret;
}
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
+}
+
static const struct file_operations btree_deadlock_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
@@ -876,6 +886,19 @@ static const struct file_operations btree_deadlock_ops = {
.read = bch2_btree_deadlock_read,
};
+static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
+}
+
+static const struct file_operations write_points_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_write_points_read,
+};
+
void bch2_fs_debug_exit(struct bch_fs *c)
{
if (!IS_ERR_OR_NULL(c->fs_debug_dir))
@@ -927,6 +950,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
c->btree_debug, &btree_deadlock_ops);
+ debugfs_create_file("write_points", 0400, c->fs_debug_dir,
+ c->btree_debug, &write_points_ops);
+
c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 600eee936f13..d7f9f79318a2 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -13,6 +13,40 @@
#include <linux/dcache.h>
+static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
+
+#ifdef CONFIG_UNICODE
+ unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
+ int ret = PTR_ERR_OR_ZERO(buf);
+ if (ret)
+ return ret;
+
+ ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
+ if (ret <= 0)
+ return ret;
+
+ *out_cf = (struct qstr) QSTR_INIT(buf, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
+ const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ if (likely(!info->cf_encoding)) {
+ *out_cf = *str;
+ return 0;
+ } else {
+ return bch2_casefold(trans, info, str, out_cf);
+ }
+}
+
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@@ -28,13 +62,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
#endif
return bkey_bytes -
- offsetof(struct bch_dirent, d_name) -
+ (d.v->d_casefold
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
+ : offsetof(struct bch_dirent, d_name)) -
trailing_nuls;
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
{
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ if (d.v->d_casefold) {
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
+ } else {
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ }
+}
+
+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
+{
+ if (d.v->d_casefold) {
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+ unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
+ } else {
+ return (struct qstr) QSTR_INIT(NULL, 0);
+ }
+}
+
+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
+{
+ return d.v->d_casefold
+ ? bch2_dirent_get_casefold_name(d)
+ : bch2_dirent_get_name(d);
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -57,7 +116,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(d);
+ struct qstr name = bch2_dirent_get_lookup_name(d);
return bch2_dirent_hash(info, &name);
}
@@ -65,7 +124,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
const struct qstr *r_name = _r;
return !qstr_eq(l_name, *r_name);
@@ -75,8 +134,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_name(l);
- const struct qstr r_name = bch2_dirent_get_name(r);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
+ const struct qstr r_name = bch2_dirent_get_lookup_name(r);
return !qstr_eq(l_name, r_name);
}
@@ -104,17 +163,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ unsigned name_block_len = bch2_dirent_name_bytes(d);
struct qstr d_name = bch2_dirent_get_name(d);
+ struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
int ret = 0;
bkey_fsck_err_on(!d_name.len,
c, dirent_empty_name,
"empty name");
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
c, dirent_val_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+ "dirent names exceed bkey size (%d + %d > %d)",
+ d_name.len, d_cf_name.len, name_block_len);
/*
* Check new keys don't exceed the max length
@@ -142,6 +203,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
le64_to_cpu(d.v->d_inum) == d.k->p.inode,
c, dirent_to_itself,
"dirent points to own directory");
+
+ if (d.v->d_casefold) {
+ bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
+ d_cf_name.len > BCH_NAME_MAX,
+ c, dirent_cf_name_too_big,
+ "dirent w/ cf name too big (%u > %u)",
+ d_cf_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
+ c, dirent_stray_data_after_cf_name,
+ "dirent has stray data after cf name's NUL");
+ }
fsck_err:
return ret;
}
@@ -163,15 +236,14 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- subvol_inum dir, u8 type,
- const struct qstr *name, u64 dst)
+static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
+ subvol_inum dir,
+ u8 type,
+ int name_len, int cf_name_len,
+ u64 dst)
{
struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
-
- if (name->len > BCH_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
BUG_ON(u64s > U8_MAX);
@@ -190,14 +262,65 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
dirent->v.d_type = type;
+ dirent->v.d_unused = 0;
+ dirent->v.d_casefold = cf_name_len ? 1 : 0;
- memcpy(dirent->v.d_name, name->name, name->len);
- memset(dirent->v.d_name + name->len, 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
+ return dirent;
+}
- EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
+ const struct qstr *name)
+{
+ EBUG_ON(dirent->v.d_casefold);
+
+ memcpy(&dirent->v.d_name[0], name->name, name->len);
+ memset(&dirent->v.d_name[name->len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
+}
+
+static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
+ const struct qstr *name,
+ const struct qstr *cf_name)
+{
+ EBUG_ON(!dirent->v.d_casefold);
+ EBUG_ON(!cf_name->len);
+
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
+ memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+ name->len - cf_name->len);
+
+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+ subvol_inum dir,
+ u8 type,
+ const struct qstr *name,
+ const struct qstr *cf_name,
+ u64 dst)
+{
+ struct bkey_i_dirent *dirent;
+
+ if (name->len > BCH_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
+ if (IS_ERR(dirent))
+ return dirent;
+
+ if (cf_name)
+ dirent_init_casefolded_name(dirent, name, cf_name);
+ else
+ dirent_init_regular_name(dirent, name);
+
+ EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
return dirent;
}
@@ -213,7 +336,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -233,16 +356,28 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
+ u64 *i_size,
enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ if (hash_info->cf_encoding) {
+ struct qstr cf_name;
+ ret = bch2_casefold(trans, hash_info, name, &cf_name);
+ if (ret)
+ return ret;
+ dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
+ } else {
+ dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
+ }
+
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
+ *i_size += bkey_bytes(&dirent->k);
+
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
@@ -275,12 +410,13 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
}
int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
+ struct qstr src_name_lookup, dst_name_lookup;
struct btree_iter src_iter = { NULL };
struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
@@ -295,8 +431,11 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
+ ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
+ if (ret)
+ goto out;
old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
+ src_hash, src_dir, &src_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
@@ -308,6 +447,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
goto out;
/* Lookup dst: */
+ ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
+ if (ret)
+ goto out;
if (mode == BCH_RENAME) {
/*
* Note that we're _not_ checking if the target already exists -
@@ -315,12 +457,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
* correctness:
*/
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name);
+ dst_hash, dst_dir, &dst_name_lookup);
if (ret)
goto out;
} else {
old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
+ dst_hash, dst_dir, &dst_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
@@ -336,7 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@@ -346,7 +489,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ new_src = dirent_create_key(trans, src_dir, 0, src_name,
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@@ -406,6 +550,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+ if (old_dst.k)
+ *dst_dir_i_size -= bkey_bytes(old_dst.k);
+ *src_dir_i_size -= bkey_bytes(old_src.k);
+
+ if (mode == BCH_RENAME_EXCHANGE)
+ *src_dir_i_size += bkey_bytes(&new_src->k);
+ *dst_dir_i_size += bkey_bytes(&new_dst->k);
+
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
@@ -465,9 +617,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
+ struct qstr lookup_name;
+ int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
+ if (ret)
+ return ret;
+
struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- int ret = bkey_err(k);
+ hash_info, dir, &lookup_name, flags);
+ ret = bkey_err(k);
if (ret)
goto err;
@@ -572,3 +729,54 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
return ret < 0 ? ret : 0;
}
+
+/* fsck */
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
+ }
+ ret = -BCH_ERR_ENOENT_inode;
+found:
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bch_inode_unpacked dir_inode;
+ struct bch_hash_info dir_hash_info;
+ int ret;
+
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+ if (ret)
+ goto err;
+
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 362b3b2f2f2e..0880772b80a9 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -25,10 +25,13 @@ struct bch_inode_info;
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
-static inline unsigned dirent_val_u64s(unsigned len)
+static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
{
- return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
- sizeof(u64));
+ unsigned bytes = cf_len
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
+ : offsetof(struct bch_dirent, d_name) + len;
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
}
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
@@ -47,7 +50,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *,
+ const struct qstr *, u64, u64 *, u64 *,
enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
@@ -62,8 +65,8 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *,
- subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *, u64 *,
+ subvol_inum, struct bch_hash_info *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
@@ -79,4 +82,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
+
#endif /* _BCACHEFS_DIRENT_H */
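
A quick worked example of the new two-argument dirent_val_u64s(), as a standalone sketch: it assumes the packed layout from dirent_format.h below, which puts d_name at byte offset 9 of the value and d_cf_name_block.d_names at byte offset 14 (8-byte d_inum union, one type/flags byte, then the 5-byte casefold header).

/*
 * Illustrative only: recomputes dirent_val_u64s() with plain arithmetic.
 * The 9/14 offsets are assumptions derived from the packed bch_dirent layout.
 */
#include <stdio.h>

static unsigned example_dirent_val_u64s(unsigned len, unsigned cf_len)
{
	unsigned bytes = cf_len
		? 14 + len + cf_len	/* casefold header + name + casefolded name */
		: 9 + len;		/* header + name */

	return (bytes + 7) / 8;		/* DIV_ROUND_UP(bytes, sizeof(u64)) */
}

int main(void)
{
	printf("%u\n", example_dirent_val_u64s(5, 0));	/* "Hello": 14 bytes -> 2 u64s */
	printf("%u\n", example_dirent_val_u64s(5, 5));	/* "Hello" + "hello": 24 bytes -> 3 u64s */
	return 0;
}
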
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
index 5e116b88e814..a46dbddd21aa 100644
--- a/fs/bcachefs/dirent_format.h
+++ b/fs/bcachefs/dirent_format.h
@@ -29,9 +29,25 @@ struct bch_dirent {
* Copy of mode bits 12-15 from the target inode - so userspace can get
* the filetype without having to do a stat()
*/
- __u8 d_type;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 d_type:5,
+ d_unused:2,
+ d_casefold:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 d_casefold:1,
+ d_unused:2,
+ d_type:5;
+#endif
- __u8 d_name[];
+ union {
+ struct {
+ __u8 d_pad;
+ __le16 d_name_len;
+ __le16 d_cf_name_len;
+ __u8 d_names[];
+ } d_cf_name_block __packed;
+ __DECLARE_FLEX_ARRAY(__u8, d_name);
+ } __packed;
} __packed __aligned(8);
#define DT_SUBVOL 16
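
For reference, a minimal host-endian sketch of how the d_cf_name_block union above is used: the original name and its casefolded form are stored back to back in d_names[], with the two length fields describing the split; bch2_dirent_get_name() reads the first slice and bch2_dirent_get_casefold_name() (used for hashing and comparison) reads the second. The struct below uses natural alignment and plain u16s rather than the packed __le16 on-disk fields, so it is illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct example_cf_name_block {
	uint16_t name_len;	/* stands in for __le16 d_name_len */
	uint16_t cf_name_len;	/* stands in for __le16 d_cf_name_len */
	uint8_t  names[64];	/* d_names[]: name, then casefolded name */
};

int main(void)
{
	struct example_cf_name_block b = { .name_len = 5, .cf_name_len = 5 };

	memcpy(&b.names[0], "Hello", 5);		/* original name */
	memcpy(&b.names[b.name_len], "hello", 5);	/* casefolded copy */

	printf("%.*s\n", b.name_len, &b.names[0]);		/* what readdir reports */
	printf("%.*s\n", b.cf_name_len, &b.names[b.name_len]);	/* what lookup hashes */
	return 0;
}
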
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f4372cafea2e..f9214e2d1346 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -85,6 +85,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
s64 *, unsigned, bool);
+
+#define disk_accounting_key_init(_k, _type, ...) \
+do { \
+ memset(&(_k), 0, sizeof(_k)); \
+ (_k).type = BCH_DISK_ACCOUNTING_##_type; \
+ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
+} while (0)
+
+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
+({ \
+ struct disk_accounting_pos pos; \
+ disk_accounting_key_init(pos, __VA_ARGS__); \
+ bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \
+})
+
+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
+ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
+
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
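
The two helper macros above wrap the common pattern of filling a struct disk_accounting_pos by hand before calling bch2_disk_accounting_mod(); note that bch2_disk_accounting_mod2_nr() references `trans` directly rather than its `_trans` parameter, so callers need a variable of that name in scope. A hedged sketch of the intended call style, assuming the usual three dev_data_type counters (buckets, sectors, fragmented); the function name and values are illustrative:

static int example_mod_dev_data_type(struct btree_trans *trans,
				     struct bch_dev *ca,
				     enum bch_data_type data_type,
				     s64 delta_buckets, s64 delta_sectors,
				     s64 delta_fragmented, bool gc)
{
	s64 v[3] = { delta_buckets, delta_sectors, delta_fragmented };

	/* expands to disk_accounting_key_init(pos, dev_data_type, ...) followed by
	 * bch2_disk_accounting_mod(trans, &pos, v, ARRAY_SIZE(v), gc) */
	return bch2_disk_accounting_mod2(trans, gc, v, dev_data_type,
					 .dev		= ca->dev_idx,
					 .data_type	= data_type);
}
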
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 7b6e6c97e6aa..15190196485f 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -113,14 +113,14 @@ enum disk_accounting_type {
BCH_DISK_ACCOUNTING_TYPE_NR,
};
-struct bch_nr_inodes {
+struct bch_acct_nr_inodes {
};
-struct bch_persistent_reserved {
+struct bch_acct_persistent_reserved {
__u8 nr_replicas;
};
-struct bch_dev_data_type {
+struct bch_acct_dev_data_type {
__u8 dev;
__u8 data_type;
};
@@ -149,10 +149,10 @@ struct disk_accounting_pos {
struct {
__u8 type;
union {
- struct bch_nr_inodes nr_inodes;
- struct bch_persistent_reserved persistent_reserved;
+ struct bch_acct_nr_inodes nr_inodes;
+ struct bch_acct_persistent_reserved persistent_reserved;
struct bch_replicas_entry_v1 replicas;
- struct bch_dev_data_type dev_data_type;
+ struct bch_acct_dev_data_type dev_data_type;
struct bch_acct_compression compression;
struct bch_acct_snapshot snapshot;
struct bch_acct_btree btree;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d2a5e76e6479..f2b9225fe0bc 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -20,6 +20,7 @@
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
+#include "lru.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -104,6 +105,7 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ u64 submit_time;
struct bio bio;
};
@@ -298,10 +300,22 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
if (flags & BTREE_TRIGGER_transactional) {
+ struct extent_ptr_decoded p = {
+ .ptr = *ptr,
+ .crc = bch2_extent_crc_unpack(s.k, NULL),
+ };
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
+ (const union bch_extent_entry *) ptr, &bp);
+
struct bkey_i_alloc_v4 *a =
bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
+ bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
+ !(flags & BTREE_TRIGGER_overwrite));
+ if (ret)
+ goto err;
}
if (flags & BTREE_TRIGGER_gc) {
@@ -366,19 +380,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->disk_label = s->disk_label;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -399,6 +400,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
(new_s->nr_blocks != old_s->nr_blocks ||
new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_transactional) {
+ int ret = bch2_lru_change(trans,
+ BCH_LRU_STRIPE_FRAGMENTATION,
+ idx,
+ stripe_lru_pos(old_s),
+ stripe_lru_pos(new_s));
+ if (ret)
+ return ret;
+ }
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
@@ -472,38 +482,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- if (flags & BTREE_TRIGGER_atomic) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- stripe_to_mem(m, new_s);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- }
-
return 0;
}
@@ -726,14 +704,15 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "erasure coding %s error: %s",
+ bch2_account_io_completion(ca, bio_data_dir(bio),
+ ec_bio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
int stale = dev_ptr_stale(ca, ptr);
if (stale) {
@@ -796,6 +775,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
@@ -917,26 +897,6 @@ err:
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
- ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
- if (idx >= h->size) {
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- if (n.size > h->size) {
- memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
- n.nr = h->nr;
- swap(*h, n);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- free_heap(&n);
- }
-
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1009,180 +969,50 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
s->idx = 0;
}
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
-
- lockdep_assert_held(&c->ec_stripes_heap_lock);
-
- if (h->nr &&
- h->data[0].blocks_nonempty == 0 &&
- !bch2_stripe_is_open(c, h->data[0].idx))
- return h->data[0].idx;
-
- return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
- size_t i)
-{
- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
- return ((_l->blocks_nonempty > _r->blocks_nonempty) <
- (_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
- ec_stripes_heap *_h = (ec_stripes_heap *)h;
- size_t i = _l - _h->data;
- size_t j = _r - _h->data;
-
- swap(*_l, *_r);
-
- ec_stripes_heap_set_backpointer(_h, i);
- ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- BUG_ON(m->heap_idx >= h->nr);
- BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
- .idx = idx,
- .blocks_nonempty = m->blocks_nonempty,
- }),
- &callbacks,
- &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- bool do_deletes;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
- i = m->heap_idx;
- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
-
- do_deletes = stripe_idx_to_delete(c) != 0;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (do_deletes)
- bch2_do_stripe_deletes(c);
-}
-
/* stripe deletion */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_stripe s;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
- ret = -EINVAL;
- goto err;
- }
-
- s = bkey_s_c_to_stripe(k);
- for (unsigned i = 0; i < s.v->nr_blocks; i++)
- if (stripe_blockcount_get(s.v, i)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ /*
+ * We expect write buffer races here
+ * Important: check stripe_is_open with stripe key locked:
+ */
+ if (k.k->type == KEY_TYPE_stripe &&
+ !bch2_stripe_is_open(trans->c, idx) &&
+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+/*
+ * XXX
+ * can we kill this and delete stripes from the trigger?
+ */
static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- while (1) {
- mutex_lock(&c->ec_stripes_heap_lock);
- u64 idx = stripe_idx_to_delete(c);
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (!idx)
- break;
-
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_delete(trans, idx));
- bch_err_fn(c, ret);
- if (ret)
- break;
- }
-
+ bch2_trans_run(c,
+ bch2_btree_write_buffer_tryflush(trans) ?:
+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+ 0, lru_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ec_stripe_delete(trans, lru_k.k->p.offset);
+ })));
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@@ -1294,7 +1124,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return -BCH_ERR_erasure_coding_found_btree_node;
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1360,7 +1190,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -EIO;
+ return -BCH_ERR_ENOENT_dev_not_found;
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1380,8 +1210,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
if (bp_k.k->type != KEY_TYPE_backpointer)
continue;
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+ if (bp.v->btree_id == BTREE_ID_stripes)
+ continue;
+
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bkey_s_c_to_backpointer(bp_k), &last_flushed);
+ bp, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
@@ -1393,21 +1227,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
- ret = bch2_btree_write_buffer_flush_sync(trans);
+ int ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
- for (i = 0; i < nr_data; i++) {
+ for (unsigned i = 0; i < nr_data; i++) {
ret = ec_stripe_update_bucket(trans, s, i);
if (ret)
break;
}
err:
bch2_trans_put(trans);
-
return ret;
}
@@ -1473,6 +1305,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
+ ret = s->err;
goto err;
}
@@ -1481,6 +1314,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
+ ret = -BCH_ERR_ec_block_read;
goto err;
}
@@ -1506,6 +1340,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
+ ret = -BCH_ERR_ec_block_write;
goto err;
}
@@ -1527,6 +1362,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ret)
goto err;
err:
+ trace_stripe_create(c, s->idx, ret);
+
bch2_disk_reservation_put(c, &s->res);
for (i = 0; i < v->nr_blocks; i++)
@@ -1612,11 +1449,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
ec_stripe_new_set_pending(c, h);
}
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
{
struct ec_stripe_new *s = ob->ec;
- s->err = -EIO;
+ s->err = err;
}
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
@@ -1968,39 +1805,40 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
return 0;
}
-static s64 get_existing_stripe(struct bch_fs *c,
- struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+ struct ec_stripe_head *head,
+ struct ec_stripe_buf *stripe,
+ u64 idx)
{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t heap_idx;
- u64 stripe_idx;
- s64 ret = -1;
-
- if (may_create_new_stripe(c))
- return -1;
+ struct bch_fs *c = trans->c;
- mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
- /* No blocks worth reusing, stripe will just be deleted: */
- if (!h->data[heap_idx].blocks_nonempty)
- continue;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- stripe_idx = h->data[heap_idx].idx;
+ /* We expect write buffer races here */
+ if (k.k->type != KEY_TYPE_stripe)
+ goto out;
- m = genradix_ptr(&c->stripes, stripe_idx);
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ if (stripe_lru_pos(s.v) <= 1)
+ goto out;
- if (m->disk_label == head->disk_label &&
- m->algorithm == head->algo &&
- m->nr_redundant == head->redundancy &&
- m->sectors == head->blocksize &&
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
- bch2_try_open_stripe(c, head->s, stripe_idx)) {
- ret = stripe_idx;
- break;
- }
+ if (s.v->disk_label == head->disk_label &&
+ s.v->algorithm == head->algo &&
+ s.v->nr_redundant == head->redundancy &&
+ le16_to_cpu(s.v->sectors) == head->blocksize &&
+ bch2_try_open_stripe(c, head->s, idx)) {
+ bkey_reassemble(&stripe->key, k);
+ ret = 1;
}
- mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+ bch2_set_btree_iter_dontneed(&iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2052,24 +1890,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
- s64 idx;
- int ret;
/*
* If we can't allocate a new stripe, and there's no stripes with empty
* blocks for us to reuse, that means we have to wait on copygc:
*/
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
+ if (may_create_new_stripe(c))
+ return -1;
- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+ 0, lru_k, ret) {
+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+ if (ret)
+ break;
}
+ bch2_trans_iter_exit(trans, &lru_iter);
+ if (!ret)
+ ret = -BCH_ERR_stripe_alloc_blocked;
+ if (ret == 1)
+ ret = 0;
+ if (ret)
+ return ret;
return init_new_stripe_from_existing(c, s);
}
@@ -2367,46 +2214,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
- stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- 0;
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
- m = genradix_ptr(&c->stripes, h->data[i].idx);
-
- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
- h->data[i].blocks_nonempty,
- m->nr_blocks - m->nr_redundant,
- m->nr_redundant);
- if (bch2_stripe_is_open(c, h->data[i].idx))
- prt_str(out, " open");
- prt_newline(out);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
+ return 0;
}
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2477,15 +2285,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
BUG_ON(!list_empty(&c->ec_stripe_new_list));
- free_heap(&c->ec_stripes_heap);
- genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
void bch2_fs_ec_init_early(struct bch_fs *c)
{
spin_lock_init(&c->ec_stripes_new_lock);
- mutex_init(&c->ec_stripes_heap_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
@@ -2503,3 +2308,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
+
+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
+{
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ u64 lru_idx = stripe_lru_pos(s.v);
+ if (lru_idx) {
+ int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
+ k.k->p.offset, lru_idx, k, last_flushed);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
+ POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 583ca6a226da..62d27e04d763 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s,
memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
}
+#define STRIPE_LRU_POS_EMPTY 1
+
+static inline u64 stripe_lru_pos(const struct bch_stripe *s)
+{
+ if (!s)
+ return 0;
+
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
+
+ for (unsigned i = 0; i < nr_data; i++)
+ blocks_empty += !stripe_blockcount_get(s, i);
+
+ /* Will be picked up by the stripe_delete worker */
+ if (blocks_empty == nr_data)
+ return STRIPE_LRU_POS_EMPTY;
+
+ if (!blocks_empty)
+ return 0;
+
+ /* invert: more blocks empty = reuse first */
+ return LRU_TIME_MAX - blocks_empty;
+}
+
static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
@@ -132,6 +155,20 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
m->sectors);
}
+static inline void gc_stripe_unlock(struct gc_stripe *s)
+{
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
+ wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void gc_stripe_lock(struct gc_stripe *s)
+{
+ wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
+}
+
struct bch_read_bio;
struct ec_stripe_buf {
@@ -212,7 +249,7 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
@@ -221,10 +258,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
unsigned, unsigned, unsigned,
enum bch_watermark, struct closure *);
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
@@ -261,11 +294,12 @@ void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
+int bch2_check_stripe_to_lru_refs(struct bch_fs *);
+
#endif /* _BCACHEFS_EC_H */
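
A standalone sketch of the stripe_lru_pos() mapping above, since the ordering is what drives both the delete worker (position 1) and the reuse path (emptier stripes sort earlier); EXAMPLE_LRU_TIME_MAX stands in for LRU_TIME_MAX and only the relative ordering matters here:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_LRU_TIME_MAX	UINT64_MAX	/* stand-in for LRU_TIME_MAX */
#define EXAMPLE_POS_EMPTY	1		/* STRIPE_LRU_POS_EMPTY */

static uint64_t example_stripe_lru_pos(unsigned nr_data, unsigned blocks_empty)
{
	if (blocks_empty == nr_data)
		return EXAMPLE_POS_EMPTY;		/* reclaimed by the delete worker */
	if (!blocks_empty)
		return 0;				/* fully used: not on the LRU */
	return EXAMPLE_LRU_TIME_MAX - blocks_empty;	/* emptier stripes sort first */
}

int main(void)
{
	/* 6 data blocks: fully empty, half empty, one block empty, full */
	printf("%llu\n", (unsigned long long) example_stripe_lru_pos(6, 6));
	printf("%llu\n", (unsigned long long) example_stripe_lru_pos(6, 3));
	printf("%llu\n", (unsigned long long) example_stripe_lru_pos(6, 1));
	printf("%llu\n", (unsigned long long) example_stripe_lru_pos(6, 0));
	return 0;
}
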
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 8d1e70e830ac..06144bfd9c19 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -20,23 +20,15 @@ struct stripe {
};
struct gc_stripe {
+ u8 lock;
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 sectors;
-
u8 nr_blocks;
u8 nr_redundant;
-
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
struct bch_replicas_padded r;
};
-struct ec_stripe_heap_entry {
- size_t idx;
- unsigned blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
#endif /* _BCACHEFS_EC_TYPES_H */
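
The new lock byte in struct gc_stripe pairs with the gc_stripe_lock()/gc_stripe_unlock() helpers added in ec.h; a minimal sketch of the intended usage when GC updates per-block sector counts concurrently (the function name is illustrative, the real callers live in the trigger code):

static void example_gc_account_stripe_sectors(struct gc_stripe *m,
					      unsigned block, s64 sectors)
{
	gc_stripe_lock(m);
	m->block_sectors[block] += sectors;
	gc_stripe_unlock(m);
}
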
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 4590cd0c7c90..101806d7ebe1 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -116,9 +116,11 @@
x(ENOENT, ENOENT_snapshot_tree) \
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_bucket_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
+ x(ENOENT, btree_node_dying) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@@ -180,6 +182,12 @@
x(EINVAL, not_in_recovery) \
x(EINVAL, cannot_rewind_recovery) \
x(0, data_update_done) \
+ x(BCH_ERR_data_update_done, data_update_done_would_block) \
+ x(BCH_ERR_data_update_done, data_update_done_unwritten) \
+ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
+ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
+ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
+ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
x(EINVAL, mismatched_block_size) \
@@ -200,6 +208,8 @@
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
x(EINVAL, varint_decode_error) \
+ x(EINVAL, erasure_coding_found_btree_node) \
+ x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -210,10 +220,18 @@
x(EROFS, insufficient_devices) \
x(0, operation_blocked) \
x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
- x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
- x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
- x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_blocked) \
+ x(BCH_ERR_journal_res_blocked, journal_blocked) \
+ x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
+ x(BCH_ERR_journal_res_blocked, journal_max_open) \
+ x(BCH_ERR_journal_res_blocked, journal_full) \
+ x(BCH_ERR_journal_res_blocked, journal_pin_full) \
+ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
+ x(BCH_ERR_journal_res_blocked, journal_stuck) \
+ x(BCH_ERR_journal_res_blocked, journal_retry_open) \
+ x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \
+ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
x(BCH_ERR_invalid, invalid_sb) \
x(BCH_ERR_invalid_sb, invalid_sb_magic) \
x(BCH_ERR_invalid_sb, invalid_sb_version) \
@@ -223,6 +241,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_offset) \
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
@@ -250,6 +269,7 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, journal_shutdown) \
x(EIO, journal_flush_err) \
+ x(EIO, journal_write_err) \
x(EIO, btree_node_read_err) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
x(EIO, sb_not_downgraded) \
@@ -258,17 +278,52 @@
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
+ x(EIO, trigger_alloc) \
x(EIO, trigger_pointer) \
x(EIO, trigger_stripe_pointer) \
x(EIO, metadata_bucket_inconsistency) \
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
- x(EIO, no_device_to_read_from) \
+ x(EIO, extent_poisened) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
+ x(EIO, device_offline) \
+ x(EIO, EIO_fault_injected) \
+ x(EIO, ec_block_read) \
+ x(EIO, ec_block_write) \
+ x(EIO, recompute_checksum) \
+ x(EIO, decompress) \
+ x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
+ x(BCH_ERR_decompress, decompress_lz4) \
+ x(BCH_ERR_decompress, decompress_gzip) \
+ x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
+ x(BCH_ERR_decompress, decompress_zstd) \
+ x(EIO, data_write) \
+ x(BCH_ERR_data_write, data_write_io) \
+ x(BCH_ERR_data_write, data_write_csum) \
+ x(BCH_ERR_data_write, data_write_invalid_ptr) \
+ x(BCH_ERR_data_write, data_write_misaligned) \
+ x(EIO, data_read) \
+ x(BCH_ERR_data_read, no_device_to_read_from) \
+ x(BCH_ERR_data_read, data_read_io_err) \
+ x(BCH_ERR_data_read, data_read_csum_err) \
+ x(BCH_ERR_data_read, data_read_retry) \
+ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
+ x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
+ x(BCH_ERR_data_read, data_read_decompress_err) \
+ x(BCH_ERR_data_read, data_read_decrypt_err) \
+ x(BCH_ERR_data_read, data_read_ptr_stale_race) \
+ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
+ x(BCH_ERR_data_read, data_read_no_encryption_key) \
+ x(BCH_ERR_data_read, data_read_buffer_too_small) \
+ x(BCH_ERR_data_read, data_read_key_overwritten) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
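
A short sketch of how the new hierarchical read-error codes are meant to be consumed: bch2_err_matches() follows the parent chain declared above, so a single private errcode can be tested at several levels of detail. The classification below is illustrative, not the patch's actual retry loop:

/* 0 = ok, 1 = retry avoiding this device, 2 = plain retry, else fatal */
static int example_classify_read_error(int ret)
{
	if (!ret)
		return 0;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
		return 1;	/* e.g. data_read_retry_io_err, data_read_retry_csum_err */

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		return 2;	/* e.g. data_read_retry_csum_err_maybe_userspace */

	return ret;		/* still resolves to EIO for callers outside bcachefs */
}
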
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 038da6a61f6b..207f35d3cce2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -3,8 +3,8 @@
#include "btree_cache.h"
#include "btree_iter.h"
#include "error.h"
-#include "fs-common.h"
#include "journal.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
- bool dev;
+
+ /* XXX: if it's reads or checksums that are failing, set it to failed */
down_write(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED)
- : bch2_fs_emergency_read_only(c))
+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+ if (write_errors_start &&
+ time_after(jiffies,
+ write_errors_start + c->opts.write_error_timeout * HZ)) {
+ if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+ goto out;
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+
bch_err(ca,
- "too many IO errors, setting %s RO",
+ "writes erroring for %u seconds, setting %s ro",
+ c->opts.write_error_timeout,
dev ? "device" : "filesystem");
+ if (!dev)
+ bch2_fs_emergency_read_only(c);
+
+ }
+out:
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
- //queue_work(system_long_wq, &ca->io_error_work);
+
+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+ ca->write_errors_start = jiffies;
+
+ queue_work(system_long_wq, &ca->io_error_work);
}
enum ask_yn {
@@ -530,35 +546,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
-int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ subvol_inum inum, u64 offset)
{
u32 restart_count = trans->restart_count;
int ret = 0;
- /* XXX: we don't yet attempt to print paths when we don't know the subvol */
- if (inum.subvol)
- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+ if (inum.subvol) {
+ ret = bch2_inum_to_path(trans, inum, out);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ }
if (!inum.subvol || ret)
prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+ prt_printf(out, " offset %llu: ", offset);
return trans_was_restarted(trans, restart_count);
}
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- subvol_inum inum, u64 offset)
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ subvol_inum inum, u64 offset)
{
- int ret = bch2_inum_err_msg_trans(trans, out, inum);
- prt_printf(out, " offset %llu: ", offset);
- return ret;
+ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}
-void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bpos pos)
{
- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ if (!bch2_snapshot_is_leaf(c, pos.snapshot))
+ prt_str(out, "(multiple snapshots) ");
+
+ subvol_inum inum = {
+ .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot),
+ .inum = pos.inode,
+ };
+
+ if (inum.subvol) {
+ ret = bch2_inum_to_path(trans, inum, out);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ }
+
+ if (!inum.subvol || ret)
+ prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot);
+
+ prt_printf(out, " offset %llu: ", pos.offset << 9);
+ return 0;
}
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- subvol_inum inum, u64 offset)
+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bpos pos)
{
- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
+ bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 7acf2a27ca28..7d3f0e2a5fd6 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_dev_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
-
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
+ enum bch_member_error_type type,
+ bool success)
+{
+ if (likely(success)) {
+ if (type == BCH_MEMBER_ERROR_write &&
+ ca->write_errors_start)
+ ca->write_errors_start = 0;
+ } else {
+ bch2_io_error(ca, type);
+ }
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca,
+ enum bch_member_error_type type,
+ u64 submit_time, bool success)
+{
+ if (unlikely(!ca))
+ return;
+
+ if (type != BCH_MEMBER_ERROR_checksum)
+ bch2_latency_acct(ca, submit_time, type);
+
+ bch2_account_io_success_fail(ca, type, success);
+}
-int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
+
#endif /* _BCACHEFS_ERROR_H */
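
A sketch of the call pattern bch2_account_io_completion() is designed for, mirroring the ec_block_endio() conversion earlier in this series: record the submit time when the bio is issued, then account latency and success/failure in a single call from the completion handler. All example_* names are illustrative:

struct example_bio {
	struct bch_dev	*ca;
	u64		submit_time;
	struct bio	bio;
};

static void example_write_endio(struct bio *bio)
{
	struct example_bio *e = container_of(bio, struct example_bio, bio);

	/* accounts latency (skipped for checksum errors) and clears or starts
	 * the write_errors_start timestamp consumed by bch2_io_error_work() */
	bch2_account_io_completion(e->ca, BCH_MEMBER_ERROR_write,
				   e->submit_time, !bio->bi_status);

	if (bio->bi_status)
		bch_err_dev_ratelimited(e->ca, "write error: %s",
					bch2_blk_status_to_str(bio->bi_status));
}

static void example_submit(struct example_bio *e)
{
	e->submit_time = local_clock();
	submit_bio(&e->bio);
}
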
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 2d8042f853dc..ae1a1d917805 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -28,6 +28,13 @@
#include "trace.h"
#include "util.h"
+static const char * const bch2_extent_flags_strs[] = {
+#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
+ BCH_EXTENT_FLAGS()
+#undef x
+ NULL,
+};
+
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -51,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p)
+ struct extent_ptr_decoded *p,
+ bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@@ -59,53 +67,57 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
- f->dev = p->ptr.dev;
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else if (p->idx != f->idx) {
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else {
- f->nr_failed++;
+ memset(f, 0, sizeof(*f));
+ f->dev = p->ptr.dev;
}
+
+ if (p->do_ec_reconstruct)
+ f->failed_ec = true;
+ else if (!csum_error)
+ f->failed_io = true;
+ else
+ f->failed_csum_nr++;
}
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+static inline u64 dev_latency(struct bch_dev *ca)
{
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
+static inline int dev_failed(struct bch_dev *ca)
+{
+ return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
/*
* returns true if p1 is better than p2:
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
- const struct extent_ptr_decoded p2)
+ u64 p1_latency,
+ struct bch_dev *ca1,
+ const struct extent_ptr_decoded p2,
+ u64 p2_latency)
{
- if (likely(!p1.idx && !p2.idx)) {
- u64 l1 = dev_latency(c, p1.ptr.dev);
- u64 l2 = dev_latency(c, p2.ptr.dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- l1 *= l1;
- l2 *= l2;
+ int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+ if (unlikely(failed_delta))
+ return failed_delta < 0;
- /* Pick at random, biased in favor of the faster device: */
+ if (unlikely(bch2_force_reconstruct_read))
+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
- return bch2_get_random_u64_below(l1 + l2) > l1;
- }
+ if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
+
+ int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+ if (unlikely(crc_retry_delta))
+ return crc_retry_delta < 0;
- if (bch2_force_reconstruct_read)
- return p1.idx > p2.idx;
+ /* Pick at random, biased in favor of the faster device: */
- return p1.idx < p2.idx;
+ return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@@ -115,64 +127,108 @@ static inline bool ptr_better(struct bch_fs *c,
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick)
+ struct extent_ptr_decoded *pick,
+ int dev)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_dev_io_failures *f;
- int ret = 0;
+ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+ bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return -BCH_ERR_extent_poisened;
+
rcu_read_lock();
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 pick_latency;
+
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ have_dirty_ptrs |= !p.ptr.cached;
+
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
- ret = 0;
- break;
+ rcu_read_unlock();
+ return 0;
}
- /*
- * If there are any dirty pointers it's an error if we can't
- * read:
- */
- if (!ret && !p.ptr.cached)
- ret = -BCH_ERR_no_device_to_read_from;
+ /* Are we being asked to read from a specific device? */
+ if (dev >= 0 && p.ptr.dev != dev)
+ continue;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (f)
- p.idx = f->nr_failed < f->nr_retries
- ? f->idx
- : f->idx + 1;
+ struct bch_dev_io_failures *f =
+ unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (unlikely(f)) {
+ p.crc_retry_nr = f->failed_csum_nr;
+ p.has_ec &= ~f->failed_ec;
- if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
- p.idx++;
+ if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+ have_io_errors |= f->failed_io;
+ have_io_errors |= f->failed_ec;
+ }
+ have_csum_errors |= !!f->failed_csum_nr;
- if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
- p.idx++;
+ if (p.has_ec && (f->failed_io || f->failed_csum_nr))
+ p.do_ec_reconstruct = true;
+ else if (f->failed_io ||
+ f->failed_csum_nr > c->opts.checksum_err_retry_nr)
+ continue;
+ }
- if (p.idx > (unsigned) p.has_ec)
- continue;
+ have_missing_devs |= ca && !bch2_dev_is_online(ca);
- if (ret > 0 && !ptr_better(c, p, *pick))
- continue;
+ if (!ca || !bch2_dev_is_online(ca)) {
+ if (!p.has_ec)
+ continue;
+ p.do_ec_reconstruct = true;
+ }
+
+ if (bch2_force_reconstruct_read && p.has_ec)
+ p.do_ec_reconstruct = true;
- *pick = p;
- ret = 1;
+ u64 p_latency = dev_latency(ca);
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ p_latency *= p_latency;
+
+ if (!have_pick ||
+ ptr_better(c,
+ p, p_latency, ca,
+ *pick, pick_latency)) {
+ *pick = p;
+ pick_latency = p_latency;
+ have_pick = true;
+ }
}
rcu_read_unlock();
- return ret;
+ if (have_pick)
+ return 1;
+ if (!have_dirty_ptrs)
+ return 0;
+ if (have_missing_devs)
+ return -BCH_ERR_no_device_to_read_from;
+ if (have_csum_errors)
+ return -BCH_ERR_data_read_csum_err;
+ if (have_io_errors)
+ return -BCH_ERR_data_read_io_err;
+
+ WARN_ONCE(1, "unhandled error case in %s\n", __func__);
+ return -EINVAL;
}
/* KEY_TYPE_btree_ptr: */
@@ -536,29 +592,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
struct bch_extent_crc_unpacked src,
enum bch_extent_entry_type type)
{
-#define set_common_fields(_dst, _src) \
- _dst.type = 1 << type; \
- _dst.csum_type = _src.csum_type, \
- _dst.compression_type = _src.compression_type, \
- _dst._compressed_size = _src.compressed_size - 1, \
- _dst._uncompressed_size = _src.uncompressed_size - 1, \
- _dst.offset = _src.offset
+#define common_fields(_src) \
+ .type = BIT(type), \
+ .csum_type = _src.csum_type, \
+ .compression_type = _src.compression_type, \
+ ._compressed_size = _src.compressed_size - 1, \
+ ._uncompressed_size = _src.uncompressed_size - 1, \
+ .offset = _src.offset
switch (type) {
case BCH_EXTENT_ENTRY_crc32:
- set_common_fields(dst->crc32, src);
- dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ dst->crc32 = (struct bch_extent_crc32) {
+ common_fields(src),
+ .csum = (u32 __force) *((__le32 *) &src.csum.lo),
+ };
break;
case BCH_EXTENT_ENTRY_crc64:
- set_common_fields(dst->crc64, src);
- dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = (u64 __force) src.csum.lo;
- dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ dst->crc64 = (struct bch_extent_crc64) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum_lo = (u64 __force) src.csum.lo,
+ .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
+ };
break;
case BCH_EXTENT_ENTRY_crc128:
- set_common_fields(dst->crc128, src);
- dst->crc128.nonce = src.nonce;
- dst->crc128.csum = src.csum;
+ dst->crc128 = (struct bch_extent_crc128) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum = src.csum,
+ };
break;
default:
BUG();
@@ -997,7 +1059,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+ return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1220,6 +1282,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
+ case BCH_EXTENT_ENTRY_flags:
+ prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+ break;
+
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1381,6 +1447,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
#endif
break;
}
+ case BCH_EXTENT_ENTRY_flags:
+ bkey_fsck_err_on(entry != ptrs.start,
+ c, extent_flags_not_at_start,
+ "extent flags entry not at start");
+ break;
}
}
@@ -1447,6 +1518,28 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
+{
+ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+ if (ret)
+ return ret;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ if (ptrs.start != ptrs.end &&
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+ ptrs.start->flags.flags = flags;
+ } else {
+ struct bch_extent_flags f = {
+ .type = BIT(BCH_EXTENT_ENTRY_flags),
+ .flags = flags,
+ };
+ __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1492,8 +1585,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_flags:
break;
}
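The rewritten ptr_better() above keeps the old read-balancing rule as its final tiebreak: per-device latencies are squared and the read goes to one device at random, weighted by the other device's squared latency, so the slower device still sees occasional reads and its latency estimate stays current. Below is a minimal userspace sketch of just that weighting rule; pick_dev1() and the use of rand() in place of bch2_get_random_u64_below() are illustrative stand-ins, not bcachefs code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Return nonzero if device 1 should serve this read rather than device 2. */
static int pick_dev1(uint64_t l1, uint64_t l2)
{
	/*
	 * Square the latencies to bias harder toward the faster device,
	 * without ever starving the slower one completely:
	 */
	l1 *= l1;
	l2 *= l2;

	/*
	 * Draw in [0, l1 + l2): device 1 wins with probability roughly
	 * l2 / (l1 + l2), i.e. lower latency -> picked more often.
	 */
	return (uint64_t) rand() % (l1 + l2) >= l1;
}

int main(void)
{
	unsigned wins = 0, trials = 100000;

	for (unsigned i = 0; i < trials; i++)
		wins += pick_dev1(2, 10);	/* device 1 is 5x faster */

	/* Expect roughly 96% (100/104) of reads to land on the faster device */
	printf("device 1 picked %u of %u times\n", wins, trials);
	return 0;
}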
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 204d765dd74c..e78a39e7e18f 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
- (_ptr).idx = 0; \
- (_ptr).has_ec = false; \
+ (_ptr).has_ec = false; \
+ (_ptr).do_ec_reconstruct = false; \
+ (_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@@ -401,10 +402,10 @@ out: \
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *);
+ struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
- struct extent_ptr_decoded *);
+ struct extent_ptr_decoded *, int);
/* KEY_TYPE_btree_ptr: */
@@ -753,4 +754,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
+{
+ if (ptrs.start != ptrs.end &&
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+ return ptrs.start->flags.flags;
+ return 0;
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
+{
+ return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
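The helpers above rely on the rule, enforced by bch2_bkey_ptrs_validate() in this series, that a flags entry, if present, is always the first entry of the extent: reading the flags is a single type check on ptrs.start, and setting them either rewrites that entry in place or shifts the rest of the list down by one. The toy model below illustrates that optional-leading-entry layout; struct toy_key, ENTRY_ptr/ENTRY_flags and the helpers are invented for the sketch and are not bcachefs types.

#include <stdio.h>
#include <string.h>

enum { ENTRY_ptr, ENTRY_flags };	/* toy entry types */

struct entry { int type; unsigned long long val; };

struct toy_key {
	unsigned nr;
	struct entry e[8];	/* assumed large enough for the demo */
};

/* Like bch2_bkey_extent_ptrs_flags(): flags only count if they come first. */
static unsigned long long get_flags(const struct toy_key *k)
{
	return k->nr && k->e[0].type == ENTRY_flags ? k->e[0].val : 0;
}

/* Like bch2_bkey_extent_flags_set(): update in place or insert at the front. */
static void set_flags(struct toy_key *k, unsigned long long flags)
{
	if (k->nr && k->e[0].type == ENTRY_flags) {
		k->e[0].val = flags;
		return;
	}

	memmove(&k->e[1], &k->e[0], k->nr * sizeof(k->e[0]));
	k->e[0] = (struct entry) { ENTRY_flags, flags };
	k->nr++;
}

int main(void)
{
	struct toy_key k = { .nr = 1, .e = { { ENTRY_ptr, 42 } } };

	set_flags(&k, 1);	/* e.g. the "poisoned" bit */
	printf("flags %llu, nr entries %u\n", get_flags(&k), k.nr);
	return 0;
}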
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
index c198dfc376d6..74c0252cbd98 100644
--- a/fs/bcachefs/extents_format.h
+++ b/fs/bcachefs/extents_format.h
@@ -79,8 +79,9 @@
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
+ x(rebalance, 5) \
+ x(flags, 6)
+#define BCH_EXTENT_ENTRY_MAX 7
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};
+#define BCH_EXTENT_FLAGS() \
+ x(poisoned, 0)
+
+enum bch_extent_flags_e {
+#define x(n, v) BCH_EXTENT_FLAG_##n = v,
+ BCH_EXTENT_FLAGS()
+#undef x
+};
+
+struct bch_extent_flags {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:7,
+ flags:57;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 flags:57,
+ type:7;
+#endif
+};
+
/* bch_extent_rebalance: */
#include "rebalance_format.h"
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index 43d6c341ecca..e51529dca4c2 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
- unsigned idx;
bool has_ec;
+ bool do_ec_reconstruct;
+ u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
@@ -31,10 +32,10 @@ struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
- u8 idx;
- u8 nr_failed;
- u8 nr_retries;
- } devs[BCH_REPLICAS_MAX];
+ unsigned failed_csum_nr:6,
+ failed_io:1,
+ failed_ec:1;
+ } devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 2eaffe37b5e7..0e742555cb0a 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr
return cmp(a, b, priv);
}
-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
cmp_r_func_t cmp_func, const void *priv,
size_t l, size_t r)
{
- return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
+ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
+ base1 + inorder_to_eytzinger1(r, n) * size,
cmp_func, priv);
}
-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
swap_r_func_t swap_func, const void *priv,
size_t l, size_t r)
{
- do_swap(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
+ do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
+ base1 + inorder_to_eytzinger1(r, n) * size,
size, swap_func, priv);
}
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
+static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
{
- int i, j, k;
+ unsigned i, j, k;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
- if (is_aligned(base, size, 8))
+ if (is_aligned(base1, size, 8))
swap_func = SWAP_WORDS_64;
- else if (is_aligned(base, size, 4))
+ else if (is_aligned(base1, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
+ for (i = n / 2; i >= 1; --i) {
/* Find the sift-down path all the way to the leaves. */
- for (j = i; k = j * 2 + 1, k + 1 < n;)
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+ for (j = i; k = j * 2, k < n;)
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
- if (j * 2 + 2 == n)
- j = j * 2 + 1;
+ if (j * 2 == n)
+ j *= 2;
/* Backtrack to the correct location. */
- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
- j = (j - 1) / 2;
+ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
+ j /= 2;
/* Shift the element into its correct place. */
for (k = j; j != i;) {
- j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ j /= 2;
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
/* sort */
- for (i = n - 1; i > 0; --i) {
- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+ for (i = n; i > 1; --i) {
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
/* Find the sift-down path all the way to the leaves. */
- for (j = 0; k = j * 2 + 1, k + 1 < i;)
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+ for (j = 1; k = j * 2, k + 1 < i;)
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
- if (j * 2 + 2 == i)
- j = j * 2 + 1;
+ if (j * 2 + 1 == i)
+ j *= 2;
/* Backtrack to the correct location. */
- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
- j = (j - 1) / 2;
+ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
+ j /= 2;
/* Shift the element into its correct place. */
- for (k = j; j;) {
- j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ for (k = j; j > 1;) {
+ j /= 2;
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
}
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ void *base1 = base - size;
+
+ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
+}
+
void eytzinger0_sort(void *base, size_t n, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func)
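The heapsort above now does all of its index arithmetic in 1-based (eytzinger1) form, where the children of node i are simply 2i and 2i+1 and the parent is i/2, and eytzinger0_sort_r() adapts the caller's 0-based array just by biasing the base pointer down one element. The short sketch below demonstrates both points; it is standalone demo code, not kernel code, and forming the biased pointer is technically out of bounds in ISO C even though index 0 is never dereferenced.

#include <stdio.h>

int main(void)
{
	/*
	 * 1-based heap arithmetic: children of i are 2i and 2i+1, parent is
	 * i/2 - none of the +1/-1 corrections the old 0-based loops needed.
	 */
	for (unsigned i = 1; i <= 7; i++)
		printf("node %u: parent %u, children %u %u\n",
		       i, i / 2, 2 * i, 2 * i + 1);

	/*
	 * "base1 = base - size" in the kernel code: a 1-based view over a
	 * 0-based array, obtained without copying. Index 0 is never used.
	 */
	int a[] = { 10, 20, 30 };
	int *a1 = a - 1;

	printf("a1[1] = %d, a1[3] = %d\n", a1[1], a1[3]);
	return 0;
}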
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 0541192d7bc0..643c1f716061 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -6,6 +6,7 @@
#include <linux/log2.h>
#ifdef EYTZINGER_DEBUG
+#include <linux/bug.h>
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size)
return rounddown_pow_of_two(size + 1) - 1;
}
-/*
- * eytzinger1_next() and eytzinger1_prev() have the nice properties that
- *
- * eytzinger1_next(0) == eytzinger1_first())
- * eytzinger1_prev(0) == eytzinger1_last())
- *
- * eytzinger1_prev(eytzinger1_first()) == 0
- * eytzinger1_next(eytzinger1_last()) == 0
- */
-
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EYTZINGER_BUG_ON(i > size);
+ EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
- i <<= __fls(size + 1) - __fls(i);
+ i <<= __fls(size) - __fls(i);
i >>= i > size;
} else {
i >>= ffz(i) + 1;
@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EYTZINGER_BUG_ON(i > size);
+ EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
- i <<= __fls(size + 1) - __fls(i);
+ i <<= __fls(size) - __fls(i);
i -= 1;
i >>= i > size;
} else {
@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
+#define eytzinger0_for_each_prev(_i, _size) \
+ for (unsigned (_i) = eytzinger0_last((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_prev((_i), (_size)))
+
/* return greatest node <= @search, or -1 if not found */
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- unsigned i, n = 0;
-
- if (!nr)
- return -1;
-
- do {
- i = n;
- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
- } while (n < nr);
-
- if (n & 1) {
- /*
- * @i was greater than @search, return previous node:
- *
- * if @i was leftmost/smallest element,
- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
- */
- return eytzinger0_prev(i, nr);
- } else {
- return i;
- }
+ void *base1 = base - size;
+ unsigned n = 1;
+
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+ n >>= __ffs(n) + 1;
+ return n - 1;
}
+/* return smallest node > @search, or -1 if not found */
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+ void *base1 = base - size;
+ unsigned n = 1;
- /*
- * if eytitzinger0_find_le() returned -1 - no element was <= search - we
- * want to return the first element; next/prev identities mean this work
- * as expected
- *
- * similarly if find_le() returns last element, we should return -1;
- * identities mean this all works out:
- */
- return eytzinger0_next(idx, nr);
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+ n >>= __ffs(n + 1) + 1;
+ return n - 1;
}
+/* return smallest node >= @search, or -1 if not found */
static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
-
- if (idx < nr && !cmp(base + idx * size, search))
- return idx;
+ void *base1 = base - size;
+ unsigned n = 1;
- return eytzinger0_next(idx, nr);
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
+ n >>= __ffs(n + 1) + 1;
+ return n - 1;
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
- void *_base = (base); \
+ size_t _size = (size); \
+ void *_base1 = (void *)(base) - _size; \
const void *_search = (search); \
size_t _nr = (nr); \
- size_t _size = (size); \
- size_t _i = 0; \
+ size_t _i = 1; \
int _res; \
\
- while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size))) \
- _i = eytzinger0_child(_i, _res > 0); \
- _i; \
+ while (_i <= _nr && \
+ (_res = _cmp(_search, _base1 + _i * _size))) \
+ _i = eytzinger1_child(_i, _res > 0); \
+ _i - 1; \
})
void eytzinger0_sort_r(void *, size_t, size_t,
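The three rewritten search helpers above share one closed form: descend the implicit tree appending a bit per comparison (1 when the probe goes right), then strip the trailing zero bits plus one more to land back on the last node where the search went right - that node holds the answer, and the result collapses to 0 ("not found") when the search never went right. Below is a standalone demonstration of the find_le variant on a hand-laid-out eytzinger array, using __builtin_ctz() as a stand-in for __ffs(); it is demo code only.

#include <stdio.h>

/* Sorted values 10..70 laid out in 1-indexed eytzinger (BFS) order. */
static const int eytz1[] = { 0 /* unused */, 40, 20, 60, 10, 30, 50, 70 };
static const unsigned nr = 7;

/* Return the 1-based index of the greatest element <= search, or 0. */
static unsigned eytzinger1_find_le_demo(int search)
{
	unsigned n = 1;

	while (n <= nr)
		n = 2 * n + (eytz1[n] <= search);	/* go right on <= */

	/*
	 * Trailing zeros = steps taken left since the last right turn;
	 * shifting those off plus one more bit lands on that last
	 * "went right" node, which holds the answer.
	 */
	return n >> (__builtin_ctz(n) + 1);
}

int main(void)
{
	printf("le(35) -> idx %u\n", eytzinger1_find_le_demo(35));	/* 5: value 30 */
	printf("le(5)  -> idx %u\n", eytzinger1_find_le_demo(5));	/* 0: none */
	printf("le(70) -> idx %u\n", eytzinger1_find_le_demo(70));	/* 7: value 70 */
	return 0;
}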
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index ab1d5db2fa56..5ab1c73c8d4c 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans,
if (!get_more)
break;
+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
+
+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
+ break;
+
+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
+
+ /* ensure proper alignment */
+ order = min(order, __ffs(folio_offset|BIT(31)));
+
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
if (!folio)
break;
@@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
- int flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE;
+ int flags = BCH_READ_retry_if_stale|
+ BCH_READ_may_promote;
int ret = 0;
- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
@@ -211,14 +219,14 @@ static void bchfs_read(struct btree_trans *trans,
swap(rbio->bio.bi_iter.bi_size, bytes);
if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_last_fragment;
bch2_bio_page_state_set(&rbio->bio, k);
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if (flags & BCH_READ_last_fragment)
break;
swap(rbio->bio.bi_iter.bi_size, bytes);
@@ -232,7 +240,8 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
@@ -280,12 +289,13 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_read_bio *rbio =
rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_readpages_end_io);
readpage_iter_advance(&readpages_iter);
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
bchfs_read(trans, rbio, inode_inum(inode),
@@ -323,10 +333,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_read_single_folio_end_io);
rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
@@ -420,7 +430,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
}
}
- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ if (io->op.flags & BCH_WRITE_wrote_data_inline) {
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
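With the change above, readpage_bio_extend() allocates multi-page folios during readahead: the order is the largest power of two of pages that still fits in the sectors left in the extent, clamped so a folio of that order stays naturally aligned at its page index (the __ffs(folio_offset | BIT(31)) term, which also caps the value when the index is 0). A rough userspace sketch of that computation, assuming 4 KiB pages and 512-byte sectors (so PAGE_SECTORS = 8) and GCC/Clang builtins in place of the kernel helpers:

#include <stdio.h>

#define PAGE_SECTORS	8u	/* 4 KiB pages, 512-byte sectors (assumed) */

static unsigned ilog2_u32(unsigned v)	/* v must be nonzero */
{
	return 31 - __builtin_clz(v);
}

/*
 * Folio order to allocate at page index @folio_offset when
 * @sectors_remaining sectors of the extent are still unread; the caller,
 * as in the kernel, has already bailed out if fewer than PAGE_SECTORS
 * sectors remain.
 */
static unsigned readahead_folio_order(unsigned sectors_remaining,
				      unsigned long folio_offset)
{
	/* largest power-of-two number of whole pages that fits */
	unsigned order = ilog2_u32((1u << ilog2_u32(sectors_remaining)) / PAGE_SECTORS);

	/* an order-N folio must sit at an index that's a multiple of 2^N */
	unsigned align = __builtin_ctzl(folio_offset | (1ul << 31));

	return order < align ? order : align;
}

int main(void)
{
	/*
	 * 256 sectors left would allow a 32-page folio, but index 16 only
	 * permits 16-page alignment, so order 4 is chosen:
	 */
	printf("order = %u\n", readahead_folio_order(256, 16));
	return 0;
}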
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 2089c36b5866..535bc5fcbcc0 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -73,6 +73,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct blk_plug plug;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
+ bool split = false;
size_t shorten;
ssize_t ret;
@@ -99,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
GFP_KERNEL,
&c->dio_read_bioset);
- bio->bi_end_io = bch2_direct_IO_read_endio;
-
dio = container_of(bio, struct dio_read, rbio.bio);
closure_init(&dio->cl, NULL);
@@ -133,12 +132,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
+ split = true;
+
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
REQ_OP_READ,
GFP_KERNEL,
&c->bio_read);
- bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
bio->bi_opf = REQ_OP_READ|REQ_SYNC;
bio->bi_iter.bi_sector = offset >> 9;
@@ -160,7 +160,15 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ struct bch_read_bio *rbio =
+ rbio_init(bio,
+ c,
+ opts,
+ split
+ ? bch2_direct_IO_read_split_endio
+ : bch2_direct_IO_read_endio);
+
+ bch2_read(c, rbio, inode_inum(inode));
}
blk_finish_plug(&plug);
@@ -511,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
if (sync)
- dio->op.flags |= BCH_WRITE_SYNC;
- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+ dio->op.flags |= BCH_WRITE_sync;
+ dio->op.flags |= BCH_WRITE_check_enospc;
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
bio_sectors(bio), true);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 15725b4ce393..f45054cee746 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -5,8 +5,8 @@
#include "chardev.h"
#include "dirent.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-ioctl.h"
+#include "namei.h"
#include "quota.h"
#include <linux/compat.h>
@@ -54,6 +54,32 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
(newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
+ if ((newflags ^ oldflags) & BCH_INODE_casefolded) {
+#ifdef CONFIG_UNICODE
+ int ret = 0;
+ /* Not supported on individual files. */
+ if (!S_ISDIR(bi->bi_mode))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ ret = bch2_empty_dir_trans(trans, inode_inum(inode));
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+ if (ret)
+ return ret;
+
+ bch2_check_set_feature(c, BCH_FEATURE_casefolding);
+#else
+ printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
+ return -EOPNOTSUPP;
+#endif
+ }
+
if (s->set_projinherit) {
bi->bi_fields_set &= ~(1 << Inode_opt_project);
bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
@@ -218,7 +244,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
int ret = 0;
subvol_inum inum;
- kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
if (!kname)
return -ENOMEM;
@@ -511,10 +537,6 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
ret = -EXDEV;
goto err;
}
- if (!d_is_positive(victim)) {
- ret = -ENOENT;
- goto err;
- }
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
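After this change the casefold bit is driven through the regular inode-flag ioctls: it is accepted only on directories, only while they are empty, and only once the incompatible casefolding feature has been granted. Below is a hedged userspace sketch using the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface that bch2_inode_flags_set() sits behind; error handling is minimal and the exact errno values returned on refusal are not guaranteed here.

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <empty directory>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	int flags;

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		perror(argv[1]);
		return 1;
	}

	/*
	 * Only directories may be casefolded, and the kernel refuses if the
	 * directory is not empty or the filesystem lacks the feature.
	 */
	flags |= FS_CASEFOLD_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags))
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return 0;
}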
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
index d30f9bb056fd..ecd3bfdcde21 100644
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
@@ -6,19 +6,21 @@
/* bcachefs inode flags -> vfs inode flags: */
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+ [__BCH_INODE_casefolded] = S_CASEFOLD,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+ [__BCH_INODE_casefolded] = FS_CASEFOLD_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 90ade8f648d9..c88c149d5de5 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -11,7 +11,6 @@
#include "errcode.h"
#include "extents.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
@@ -22,6 +21,7 @@
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "snapshot.h"
@@ -641,7 +641,9 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -651,30 +653,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (inode)
goto out;
+ /*
+ * Note: if check/repair needs it, we commit before
+ * bch2_inode_hash_init_insert(), as after that point we can't take a
+ * restart - not in the top level loop with a commit_do(), like we
+ * usually do:
+ */
+
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+ /*
+ * don't remove it: check_inodes might find another inode that points
+ * back to this dirent
+ */
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
c, "dirent to missing inode:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
if (ret)
goto err;
-
- /* regular files may have hardlinks: */
- if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
- !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
- c,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, &inode_u),
- buf.buf))) {
- ret = -ENOENT;
- goto err;
- }
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -698,6 +700,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
if (IS_ERR(inode))
inode = NULL;
+#ifdef CONFIG_UNICODE
+ if (!inode && IS_CASEFOLDED(vdir)) {
+ /*
+ * Do not cache a negative dentry in casefolded directories
+ * as it would need to be invalidated in the following situation:
+ * - Lookup file "blAH" in a casefolded directory
+ * - Creation of file "BLAH" in a casefolded directory
+ * - Lookup file "blAH" in a casefolded directory
+ * which would fail if we had a negative dentry.
+ *
+ * We should come back to this when VFS has a method to handle
+ * this edgecase.
+ */
+ return NULL;
+ }
+#endif
+
return d_splice_alias(&inode->v, dentry);
}
@@ -858,10 +877,10 @@ err:
return bch2_err_class(ret);
}
-static int bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
+static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
+ return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
}
static int bch2_rename2(struct mnt_idmap *idmap,
@@ -1802,7 +1821,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
break;
}
- mapping_set_large_folios(inode->v.i_mapping);
+ mapping_set_folio_min_order(inode->v.i_mapping,
+ get_order(trans->c->opts.block_size));
}
static void bch2_free_inode(struct inode *vinode)
@@ -2008,44 +2028,6 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags,
- struct bch_opts opts)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
- if (opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts, errors))
- c->opts.errors = opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
@@ -2192,6 +2174,9 @@ static int bch2_fs_get_tree(struct fs_context *fc)
if (ret)
goto err;
+ if (opt_defined(opts, discard))
+ set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
+
/* Some options can't be parsed until after the fs is started: */
opts = bch2_opts_empty();
ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
@@ -2200,9 +2185,10 @@ static int bch2_fs_get_tree(struct fs_context *fc)
bch2_opts_apply(&c->opts, opts);
- ret = bch2_fs_start(c);
- if (ret)
- goto err_stop_fs;
+ /*
+ * need to initialise sb and set c->vfs_sb _before_ starting fs,
+ * for blk_holder_ops
+ */
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
@@ -2264,6 +2250,10 @@ got_sb:
sb->s_shrink->seeks = 0;
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_put_super;
+
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");
@@ -2351,8 +2341,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = sb->s_fs_info;
+ int ret = 0;
+
+ opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
- return bch2_remount(sb, &fc->sb_flags, opts->opts);
+ if (opts->opts.read_only != c->opts.read_only) {
+ down_write(&c->state_lock);
+
+ if (opts->opts.read_only) {
+ bch2_fs_read_only(c);
+
+ sb->s_flags |= SB_RDONLY;
+ } else {
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts->opts.read_only;
+
+ up_write(&c->state_lock);
+ }
+
+ if (opt_defined(opts->opts, errors))
+ c->opts.errors = opts->opts.errors;
+err:
+ return bch2_err_class(ret);
}
static const struct fs_context_operations bch2_context_ops = {
@@ -2396,7 +2417,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
};
MODULE_ALIAS_FS("bcachefs");
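bch2_vfs_inode_init() above now derives the minimum folio order for each inode's page cache from the filesystem block size, and FS_LBS advertises that large-block filesystems are supported, so a filesystem formatted with 16 KiB blocks gets order-2 (or larger) folios on a 4 KiB-page machine. A quick sketch of the order computation, mirroring what get_order() yields for these sizes and assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE_ASSUMED	4096u

/* Smallest folio order whose size covers @block_size (cf. get_order()). */
static unsigned min_folio_order(unsigned block_size)
{
	unsigned order = 0;

	while ((PAGE_SIZE_ASSUMED << order) < block_size)
		order++;
	return order;
}

int main(void)
{
	unsigned sizes[] = { 512, 4096, 16384, 65536 };

	for (unsigned i = 0; i < 4; i++)
		printf("block_size %6u -> min folio order %u\n",
		       sizes[i], min_folio_order(sizes[i]));
	return 0;
}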
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0e85131d0af8..091057023fc5 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -10,10 +10,10 @@
#include "dirent.h"
#include "error.h"
#include "fs.h"
-#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
@@ -23,13 +23,6 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
@@ -116,29 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = -BCH_ERR_ENOENT_inode;
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
struct bch_inode_unpacked *inode)
{
@@ -179,32 +149,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
/*
* Find any subvolume associated with a tree of snapshots
* We can't rely on master_subvol - it might have been deleted.
@@ -548,7 +492,7 @@ static int remove_backpointer(struct btree_trans *trans,
SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
int ret = bkey_err(d) ?:
dirent_points_to_inode(c, d, inode) ?:
- __remove_dirent(trans, d.k->p);
+ bch2_fsck_remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1985,169 +1929,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa
trans_was_restarted(trans, restart_count);
}
-noinline_for_stack
-static int check_dirent_inode_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (fsck_err_on(!backpointer_exists,
- trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- goto out;
- }
-
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (fsck_err_on(backpointer_exists &&
- (S_ISDIR(target->bi_mode) ||
- target->bi_subvol),
- trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
-
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = check_dirent_inode_dirent(trans, iter, d, target);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
- if (ret)
- goto err;
-
- d = dirent_i_to_s_c(n);
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
/* find a subvolume that's a descendent of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
@@ -2247,7 +2028,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (fsck_err(trans, dirent_to_missing_subvol,
"dirent points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return __remove_dirent(trans, d.k->p);
+ return bch2_fsck_remove_dirent(trans, d.k->p);
ret = 0;
goto out;
}
@@ -2291,7 +2072,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
goto err;
}
- ret = check_dirent_target(trans, iter, d, &subvol_root);
+ ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
if (ret)
goto err;
out:
@@ -2378,13 +2159,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- ret = __remove_dirent(trans, d.k->p);
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
if (ret)
goto err;
}
darray_for_each(target->inodes, i) {
- ret = check_dirent_target(trans, iter, d, &i->inode);
+ ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
if (ret)
goto err;
}
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 339b80770f1d..80051073f613 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -731,10 +731,9 @@ int bch2_trigger_inode(struct btree_trans *trans,
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
- int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
if (ret)
return ret;
}
@@ -868,19 +867,6 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid, gid, mode, rdev, parent);
}
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- BUG();
- case KEY_TYPE_inode_generation:
- return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
- default:
- return 0;
- }
-}
-
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
@@ -1092,7 +1078,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
- ret = -EIO;
+ ret = -BCH_ERR_ENOENT_inode;
goto err;
}
@@ -1256,7 +1242,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
- ret = -EIO;
+ ret = -BCH_ERR_ENOENT_inode;
goto err;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 428b9be6af34..f82cfbf460d0 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -277,6 +277,7 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i
bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
return S_ISDIR(inode->bi_mode) ||
+ inode->bi_subvol ||
(!inode->bi_nlink && inode_has_bp);
}
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index b99a5bf1a75e..117110af1e3f 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -137,7 +137,8 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9)
+ x(has_child_snapshot, 9) \
+ x(casefolded, 10)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 5353979117b0..6b842c8d21be 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -115,7 +115,8 @@ err:
bch2_increment_clock(c, sectors_allocated, WRITE);
if (should_print_err(ret)) {
struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index aa91fcf51eec..f1503df57dc7 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -25,8 +25,15 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "Corrupt roughly 1/n of reads, to exercise the read retry paths (debug builds)");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -80,6 +87,7 @@ struct promote_op {
struct rhash_head hash;
struct bpos pos;
+ struct work_struct work;
struct data_update write;
struct bio_vec bi_inline_vecs[]; /* must be last */
};
@@ -96,6 +104,33 @@ static inline bool have_io_error(struct bch_io_failures *failed)
return failed && failed->nr;
}
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
+{
+ EBUG_ON(rbio->split);
+
+ return rbio->data_update
+ ? container_of(rbio, struct data_update, rbio)
+ : NULL;
+}
+
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
+{
+ struct data_update *u = rbio_data_update(orig);
+ if (!u)
+ return false;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev &&
+ u->data_opts.rewrite_ptrs & BIT(i))
+ return true;
+ i++;
+ }
+
+ return false;
+}
+
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
@@ -105,7 +140,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
if (!have_io_error(failed)) {
BUG_ON(!opts.promote_target);
- if (!(flags & BCH_READ_MAY_PROMOTE))
+ if (!(flags & BCH_READ_may_promote))
return -BCH_ERR_nopromote_may_not;
if (bch2_bkey_has_target(c, k, opts.promote_target))
@@ -125,98 +160,93 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static void promote_free(struct bch_fs *c, struct promote_op *op)
+static noinline void promote_free(struct bch_read_bio *rbio)
{
- int ret;
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
+ struct bch_fs *c = rbio->c;
+
+ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
bch2_data_update_exit(&op->write);
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
static void promote_done(struct bch_write_op *wop)
{
- struct promote_op *op =
- container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.op.c;
+ struct promote_op *op = container_of(wop, struct promote_op, write.op);
+ struct bch_fs *c = op->write.rbio.c;
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
- op->start_time);
- promote_free(c, op);
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
+ promote_free(&op->write.rbio);
}
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+static void promote_start_work(struct work_struct *work)
{
- struct bio *bio = &op->write.op.wbio.bio;
+ struct promote_op *op = container_of(work, struct promote_op, work);
- trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+ bch2_data_update_read_done(&op->write);
+}
- /* we now own pages: */
- BUG_ON(!rbio->bounce);
- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+static noinline void promote_start(struct bch_read_bio *rbio)
+{
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
- swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
- bch2_data_update_read_done(&op->write, rbio->pick.crc);
+ INIT_WORK(&op->work, promote_start_work);
+ queue_work(rbio->c->write_ref_wq, &op->work);
}
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned sectors,
- struct bch_read_bio **rbio,
- struct bch_io_failures *failed)
+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
+ struct bpos pos,
+ struct extent_ptr_decoded *pick,
+ unsigned sectors,
+ struct bch_read_bio *orig,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
- struct promote_op *op = NULL;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
+ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
- if (!op) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err;
- }
+ if (!have_io_error(failed)) {
+ update_opts.target = orig->opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags |= BCH_WRITE_cached;
+ update_opts.write_flags |= BCH_WRITE_only_specified_devs;
+ } else {
+ update_opts.target = orig->opts.foreground_target;
- op->start_time = local_clock();
- op->pos = pos;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev) &&
+ !ptr_being_rewritten(orig, ptr->dev))
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
- /*
- * We don't use the mempool here because extents that aren't
- * checksummed or compressed can be too big for the mempool:
- */
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
- sizeof(struct bio_vec) * pages,
- GFP_KERNEL);
- if (!*rbio) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err;
+ if (!update_opts.rewrite_ptrs)
+ return NULL;
}
- rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op) {
ret = -BCH_ERR_nopromote_enomem;
- goto err;
+ goto err_put;
}
- (*rbio)->bounce = true;
- (*rbio)->split = true;
- (*rbio)->kmalloc = true;
+ op->start_time = local_clock();
+ op->pos = pos;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
@@ -224,64 +254,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
goto err;
}
- bio = &op->write.op.wbio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
- struct data_update_opts update_opts = {};
-
- if (!have_io_error(failed)) {
- update_opts.target = opts.promote_target;
- update_opts.extra_replicas = 1;
- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
- } else {
- update_opts.target = opts.foreground_target;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
- bkey_for_each_ptr(ptrs, ptr) {
- if (bch2_dev_io_failures(failed, ptr->dev))
- update_opts.rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
- }
-
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
- opts,
+ &orig->opts,
update_opts,
btree_id, k);
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
- if (ret) {
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params));
- goto err;
- }
+ if (ret)
+ goto err_remove_hash;
+ rbio_init_fragment(&op->write.rbio.bio, orig);
+ op->write.rbio.bounce = true;
+ op->write.rbio.promote = true;
op->write.op.end_io = promote_done;
- return op;
+ return &op->write.rbio;
+err_remove_hash:
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
err:
- if (*rbio)
- bio_free_pages(&(*rbio)->bio);
- kfree(*rbio);
- *rbio = NULL;
+ bio_free_pages(&op->write.op.wbio.bio);
/* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
+err_put:
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
+static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
unsigned flags,
- struct bch_read_bio **rbio,
+ struct bch_read_bio *orig,
bool *bounce,
bool *read_full,
struct bch_io_failures *failed)
@@ -301,18 +310,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct bpos pos = promote_full
? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector);
- struct promote_op *promote;
int ret;
- ret = should_promote(c, k, pos, opts, flags, failed);
+ ret = should_promote(c, k, pos, orig->opts, flags, failed);
if (ret)
goto nopromote;
- promote = __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio, failed);
+ struct bch_read_bio *promote =
+ __promote_alloc(trans,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
+ k, pos, pick, sectors, orig, failed);
+ if (!promote)
+ return NULL;
+
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@@ -321,7 +333,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
*read_full = promote_full;
return promote;
nopromote:
- trace_read_nopromote(c, ret);
+ trace_io_read_nopromote(c, ret);
return NULL;
}
@@ -330,9 +342,17 @@ nopromote:
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
- return bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { rbio->subvol, read_pos.inode },
- read_pos.offset << 9);
+ int ret = lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9));
+ if (ret)
+ return ret;
+
+ if (rbio->data_update)
+ prt_str(out, "(internal move) ");
+
+ return 0;
}
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +361,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}
-#define READ_RETRY_AVOID 1
-#define READ_RETRY 2
-#define READ_ERR 3
-
enum rbio_context {
RBIO_CONTEXT_NULL,
RBIO_CONTEXT_HIGHPRI,
@@ -375,20 +391,25 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);
- if (rbio->promote)
- promote_free(rbio->c, rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+ if (rbio->have_ioref) {
+ struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
+ percpu_ref_put(&ca->io_ref);
+ }
if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;
- if (rbio->kmalloc)
- kfree(rbio);
- else
+ if (unlikely(rbio->promote)) {
+ if (!rbio->bio.bi_status)
+ promote_start(rbio);
+ else
+ promote_free(rbio);
+ } else {
+ if (rbio->bounce)
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
bio_put(&rbio->bio);
+ }
rbio = parent;
}
@@ -408,61 +429,49 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- int ret;
-
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
-
- bch2_bkey_buf_init(&sk);
-
- bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_slots);
+ struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
bch2_trans_begin(trans);
- rbio->bio.bi_status = 0;
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
+ u->btree_id, bkey_start_pos(&u->k.k->k),
+ 0)));
if (ret)
goto err;
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- if (!bch2_bkey_matches_ptr(c, k,
- rbio->pick.ptr,
- rbio->data_pos.offset -
- rbio->pick.crc.offset)) {
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
- rbio->hole = true;
- goto out;
+ rbio->ret = -BCH_ERR_data_read_key_overwritten;
+ goto err;
}
ret = __bch2_read_extent(trans, rbio, bvec_iter,
- rbio->read_pos,
- rbio->data_btree,
- k, 0, failed, flags);
- if (ret == READ_RETRY)
- goto retry;
- if (ret)
- goto err;
-out:
- bch2_rbio_done(rbio);
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
- return;
+ bkey_start_pos(&u->k.k->k),
+ u->btree_id,
+ bkey_i_to_s_c(u->k.k),
+ 0, failed, flags, -1);
err:
- rbio->bio.bi_status = BLK_STS_IOERR;
- goto out;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
+ goto retry;
+
+ if (ret) {
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
+ }
+
+ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
+ return ret;
}
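
The old READ_RETRY / READ_RETRY_AVOID / READ_ERR integers are replaced throughout by specific negative BCH_ERR_* codes, and callers such as the function above test them with bch2_err_matches() against a class (BCH_ERR_data_read_retry, BCH_ERR_data_read_retry_avoid). A standalone analogue of that class matching, with made-up names (the real table lives in fs/bcachefs/errcode.h and errcode.c):

#include <stdbool.h>

enum {
	ERR_none,
	ERR_data_read_retry,			/* the class */
	ERR_data_read_retry_io_err,		/* members of the class */
	ERR_data_read_retry_csum_err,
};

static const int err_parent[] = {
	[ERR_data_read_retry_io_err]	= ERR_data_read_retry,
	[ERR_data_read_retry_csum_err]	= ERR_data_read_retry,
};

/* Illustrative only: walk up the parent chain, so a specific error
 * matches both itself and its class. Error codes are returned negated. */
static bool err_matches(int err, int class)
{
	err = -err;
	while (err && err != class)
		err = err_parent[err];
	return err == class;
}
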
static void bch2_rbio_retry(struct work_struct *work)
@@ -477,45 +486,80 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
+ struct btree_trans *trans = bch2_trans_get(c);
- trace_and_count(c, read_retry, &rbio->bio);
+ trace_io_read_retry(&rbio->bio);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
+ bvec_iter_sectors(rbio->bvec_iter));
- if (rbio->retry == READ_RETRY_AVOID)
- bch2_mark_io_failure(&failed, &rbio->pick);
+ if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+ bch2_mark_io_failure(&failed, &rbio->pick,
+ rbio->ret == -BCH_ERR_data_read_retry_csum_err);
+
+ if (!rbio->split) {
+ rbio->bio.bi_status = 0;
+ rbio->ret = 0;
+ }
- rbio->bio.bi_status = 0;
+ unsigned subvol = rbio->subvol;
+ struct bpos read_pos = rbio->read_pos;
rbio = bch2_rbio_free(rbio);
- flags |= BCH_READ_IN_RETRY;
- flags &= ~BCH_READ_MAY_PROMOTE;
+ flags |= BCH_READ_in_retry;
+ flags &= ~BCH_READ_may_promote;
+ flags &= ~BCH_READ_last_fragment;
+ flags |= BCH_READ_must_clone;
- if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+ int ret = rbio->data_update
+ ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
+ : __bch2_read(trans, rbio, iter, inum, &failed, flags);
+
+ if (ret) {
+ rbio->ret = ret;
+ rbio->bio.bi_status = BLK_STS_IOERR;
} else {
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
+ struct printbuf buf = PRINTBUF;
+
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf,
+ (subvol_inum) { subvol, read_pos.inode },
+ read_pos.offset << 9));
+ if (rbio->data_update)
+ prt_str(&buf, "(internal move) ");
+ prt_str(&buf, "successful retry");
- __bch2_read(c, rbio, iter, inum, &failed, flags);
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
+
+ bch2_rbio_done(rbio);
+ bch2_trans_put(trans);
}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
- blk_status_t error)
+static void bch2_rbio_error(struct bch_read_bio *rbio,
+ int ret, blk_status_t blk_error)
{
- rbio->retry = retry;
+ BUG_ON(ret >= 0);
- if (rbio->flags & BCH_READ_IN_RETRY)
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
+
+ bch2_rbio_parent(rbio)->saw_error = true;
+
+ if (rbio->flags & BCH_READ_in_retry)
return;
- if (retry == READ_ERR) {
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ } else {
rbio = bch2_rbio_free(rbio);
- rbio->bio.bi_status = error;
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
+
bch2_rbio_done(rbio);
- } else {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
}
}
@@ -531,15 +575,13 @@ static void bch2_read_io_err(struct work_struct *work)
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_read);
+ if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
- } else {
+ else
bch_err_ratelimited(c, "%s", buf.buf);
- }
printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -621,14 +663,12 @@ static void bch2_read_csum_err(struct work_struct *work)
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
- } else {
+ else
bch_err_ratelimited(c, "%s", buf.buf);
- }
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -648,7 +688,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -668,7 +708,7 @@ static void bch2_read_decrypt_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -678,9 +718,11 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct bch_read_bio *parent = bch2_rbio_parent(rbio);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &parent->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
@@ -698,8 +740,26 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
+ bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
+
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
+
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
+ rbio->flags |= BCH_READ_must_bounce;
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
+ BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good)
goto csum_err;
/*
@@ -712,32 +772,40 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
- if (rbio->flags & BCH_READ_NODECODE)
- goto nodecode;
+ if (likely(!parent->data_update)) {
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->offset_into_extent;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+ if (crc_is_compressed(crc)) {
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
+ goto decompression_err;
+ } else {
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ }
+ }
+ } else {
+ if (rbio->split)
+ rbio->parent->pick = rbio->pick;
if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
@@ -754,12 +822,9 @@ static void __bch2_read_endio(struct work_struct *work)
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;
-
- promote_start(rbio->promote, rbio);
- rbio->promote = NULL;
}
-nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+
+ if (likely(!(rbio->flags & BCH_READ_in_retry))) {
rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
}
@@ -767,17 +832,6 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- goto out;
- }
-
bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
goto out;
decompression_err:
@@ -797,10 +851,8 @@ static void bch2_read_endio(struct bio *bio)
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- if (rbio->have_ioref) {
- bch2_latency_acct(ca, rbio->submit_time, READ);
- percpu_ref_put(&ca->io_ref);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rbio->submit_time, !bio->bi_status);
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
@@ -810,14 +862,14 @@ static void bch2_read_endio(struct bio *bio)
return;
}
- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
(ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
- trace_and_count(c, read_reuse_race, &rbio->bio);
+ trace_and_count(c, io_read_reuse_race, &rbio->bio);
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ if (rbio->flags & BCH_READ_retry_if_stale)
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
return;
}
@@ -883,15 +935,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags)
+ struct bch_io_failures *failed, unsigned flags, int dev)
{
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
- int pick_ret;
+ struct data_update *u = rbio_data_update(orig);
+ int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -902,19 +954,21 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
swap(iter.bi_size, bytes);
bio_advance_iter(&orig->bio, &iter, bytes);
zero_fill_bio_iter(&orig->bio, iter);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
+ bvec_iter_sectors(iter));
goto out_read_done;
}
retry_pick:
- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+ ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
/* hole or reservation - just zero fill: */
- if (!pick_ret)
+ if (!ret)
goto hole;
- if (unlikely(pick_ret < 0)) {
+ if (unlikely(ret < 0)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
@@ -930,6 +984,7 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
+ ret = -BCH_ERR_data_read_no_encryption_key;
goto err;
}
@@ -941,56 +996,57 @@ retry_pick:
* retry path, don't check here, it'll be caught in bch2_read_endio()
* and we'll end up in the retry path:
*/
- if ((flags & BCH_READ_IN_RETRY) &&
+ if ((flags & BCH_READ_in_retry) &&
!pick.ptr.cached &&
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick);
+ bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
- if (flags & BCH_READ_NODECODE) {
+ if (likely(!u)) {
+ if (!(flags & BCH_READ_last_fragment) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_must_clone;
+
+ narrow_crcs = !(flags & BCH_READ_in_retry) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
+
+ if (narrow_crcs && (flags & BCH_READ_user_mapped))
+ flags |= BCH_READ_must_bounce;
+
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+ if (crc_is_compressed(pick.crc) ||
+ (pick.crc.csum_type != BCH_CSUM_none &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+ (flags & BCH_READ_user_mapped)) ||
+ (flags & BCH_READ_must_bounce)))) {
+ read_full = true;
+ bounce = true;
+ }
+ } else {
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
percpu_ref_put(&ca->io_ref);
- goto hole;
+ orig->ret = -BCH_ERR_data_read_buffer_too_small;
+ goto out_read_done;
}
iter.bi_size = pick.crc.compressed_size << 9;
- goto get_bio;
- }
-
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_MUST_CLONE;
-
- narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
- flags |= BCH_READ_MUST_BOUNCE;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
- bounce = true;
}
if (orig->opts.promote_target || have_io_error(failed))
- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full, failed);
+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
+ &bounce, &read_full, failed);
if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@@ -1009,7 +1065,7 @@ retry_pick:
pick.crc.offset = 0;
pick.crc.live_size = bvec_iter_sectors(iter);
}
-get_bio:
+
if (rbio) {
/*
* promote already allocated bounce rbio:
@@ -1024,17 +1080,16 @@ get_bio:
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;
- rbio = rbio_init(bio_alloc_bioset(NULL,
+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
0,
GFP_NOFS,
&c->bio_read_split),
- orig->opts);
+ orig);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
rbio->bounce = true;
- rbio->split = true;
- } else if (flags & BCH_READ_MUST_CLONE) {
+ } else if (flags & BCH_READ_must_clone) {
/*
* Have to clone if there were any splits, due to error
* reporting issues (if a split errored, and retrying didn't
@@ -1043,11 +1098,10 @@ get_bio:
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
&c->bio_read_split),
- orig->opts);
+ orig);
rbio->bio.bi_iter = iter;
- rbio->split = true;
} else {
rbio = orig;
rbio->bio.bi_iter = iter;
@@ -1056,67 +1110,60 @@ get_bio:
EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
- rbio->c = c;
rbio->submit_time = local_clock();
- if (rbio->split)
- rbio->parent = orig;
- else
+ if (!rbio->split)
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
- rbio->hole = 0;
- rbio->retry = 0;
+ rbio->ret = 0;
rbio->context = 0;
- /* XXX: only initialize this if needed */
- rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
rbio->version = k.k->bversion;
- rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
- if (flags & BCH_READ_NODECODE)
- orig->pick = pick;
-
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
- trace_and_count(c, read_bounce, &rbio->bio);
+ trace_and_count(c, io_read_bounce, &rbio->bio);
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ if (!u)
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ else
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
bio_inc_remaining(&orig->bio);
- trace_and_count(c, read_split, &orig->bio);
+ trace_and_count(c, io_read_split, &orig->bio);
}
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
- if (!(flags & BCH_READ_IN_RETRY))
+ if (!(flags & BCH_READ_in_retry))
bch2_trans_unlock(trans);
else
bch2_trans_unlock_long(trans);
- if (!rbio->pick.idx) {
+ if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1126,7 +1173,9 @@ get_bio:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio,
+ -BCH_ERR_data_read_retry_device_offline,
+ BLK_STS_IOERR);
goto out;
}
@@ -1135,10 +1184,10 @@ get_bio:
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
} else {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
submit_bio(&rbio->bio);
else
submit_bio_wait(&rbio->bio);
@@ -1152,15 +1201,16 @@ get_bio:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
+ BLK_STS_IOERR);
goto out;
}
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
}
out:
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ if (likely(!(flags & BCH_READ_in_retry))) {
return 0;
} else {
bch2_trans_unlock(trans);
@@ -1170,54 +1220,54 @@ out:
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
- ret = rbio->retry;
+ ret = rbio->ret;
rbio = bch2_rbio_free(rbio);
- if (ret == READ_RETRY_AVOID) {
- bch2_mark_io_failure(failed, &pick);
- ret = READ_RETRY;
- }
-
- if (!ret)
- goto out_read_done;
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
+ bch2_mark_io_failure(failed, &pick,
+ ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
err:
- if (flags & BCH_READ_IN_RETRY)
- return READ_ERR;
+ if (flags & BCH_READ_in_retry)
+ return ret;
- orig->bio.bi_status = BLK_STS_IOERR;
+ orig->bio.bi_status = BLK_STS_IOERR;
+ orig->ret = ret;
goto out_read_done;
hole:
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
+ bvec_iter_sectors(iter));
/*
- * won't normally happen in the BCH_READ_NODECODE
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
- * to read no longer exists we have to signal that:
+ * won't normally happen in the data update (bch2_move_extent()) path,
+ * but if we retry and the extent we wanted to read no longer exists we
+ * have to signal that:
*/
- if (flags & BCH_READ_NODECODE)
- orig->hole = true;
+ if (u)
+ orig->ret = -BCH_ERR_data_read_key_overwritten;
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if ((flags & BCH_READ_last_fragment) &&
+ !(flags & BCH_READ_in_retry))
bch2_rbio_done(orig);
return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed, unsigned flags)
{
- struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
int ret;
- BUG_ON(flags & BCH_READ_NODECODE);
+ EBUG_ON(rbio->data_update);
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@@ -1267,24 +1317,26 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_last_fragment;
ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
data_btree, k,
- offset_into_extent, failed, flags);
+ offset_into_extent, failed, flags, -1);
if (ret)
goto err;
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if (flags & BCH_READ_last_fragment)
break;
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
+ flags |= BCH_READ_must_bounce;
+
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- ret != READ_RETRY &&
- ret != READ_RETRY_AVOID)
+ !bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
@@ -1292,17 +1344,22 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
- prt_printf(&buf, "read error %i from btree lookup", ret);
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
+ bvec_iter.bi_sector << 9));
+ prt_printf(&buf, "read error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- rbio->bio.bi_status = BLK_STS_IOERR;
- bch2_rbio_done(rbio);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
+
+ if (!(flags & BCH_READ_in_retry))
+ bch2_rbio_done(rbio);
}
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
+ return ret;
}
void bch2_fs_io_read_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index a82e8a94ccb6..cd21950417f6 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
+#include "btree_iter.h"
#include "reflink.h"
struct bch_read_bio {
@@ -35,19 +36,18 @@ struct bch_read_bio {
u16 flags;
union {
struct {
- u16 bounce:1,
+ u16 data_update:1,
+ promote:1,
+ bounce:1,
split:1,
- kmalloc:1,
have_ioref:1,
narrow_crcs:1,
- hole:1,
- retry:2,
+ saw_error:1,
context:2;
};
u16 _state;
};
-
- struct bch_devs_list devs_have;
+ s16 ret;
struct extent_ptr_decoded pick;
@@ -65,8 +65,6 @@ struct bch_read_bio {
struct bpos data_pos;
struct bversion version;
- struct promote_op *promote;
-
struct bch_io_opts opts;
struct work_struct work;
@@ -108,23 +106,31 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
}
+#define BCH_READ_FLAGS() \
+ x(retry_if_stale) \
+ x(may_promote) \
+ x(user_mapped) \
+ x(last_fragment) \
+ x(must_bounce) \
+ x(must_clone) \
+ x(in_retry)
+
+enum __bch_read_flags {
+#define x(n) __BCH_READ_##n,
+ BCH_READ_FLAGS()
+#undef x
+};
+
enum bch_read_flags {
- BCH_READ_RETRY_IF_STALE = 1 << 0,
- BCH_READ_MAY_PROMOTE = 1 << 1,
- BCH_READ_USER_MAPPED = 1 << 2,
- BCH_READ_NODECODE = 1 << 3,
- BCH_READ_LAST_FRAGMENT = 1 << 4,
-
- /* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 5,
- BCH_READ_MUST_CLONE = 1 << 6,
- BCH_READ_IN_RETRY = 1 << 7,
+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
+ BCH_READ_FLAGS()
+#undef x
};
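
Both enums above are generated from the single BCH_READ_FLAGS() list; expanding the first three entries by hand (illustrative, not part of the patch) gives:

enum __bch_read_flags {
	__BCH_READ_retry_if_stale,		/* 0 */
	__BCH_READ_may_promote,			/* 1 */
	__BCH_READ_user_mapped,			/* 2 */
	/* ... */
};

enum bch_read_flags {
	BCH_READ_retry_if_stale	= BIT(__BCH_READ_retry_if_stale),	/* 1 << 0 */
	BCH_READ_may_promote	= BIT(__BCH_READ_may_promote),		/* 1 << 1 */
	BCH_READ_user_mapped	= BIT(__BCH_READ_user_mapped),		/* 1 << 2 */
	/* ... */
};
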
int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
struct bvec_iter, struct bpos, enum btree_id,
struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned);
+ struct bch_io_failures *, unsigned, int);
static inline void bch2_read_extent(struct btree_trans *trans,
struct bch_read_bio *rbio, struct bpos read_pos,
@@ -132,37 +138,55 @@ static inline void bch2_read_extent(struct btree_trans *trans,
unsigned offset_into_extent, unsigned flags)
{
__bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags);
+ data_btree, k, offset_into_extent, NULL, flags, -1);
}
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
- struct bch_io_failures failed = { .nr = 0 };
-
BUG_ON(rbio->_state);
- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
+ bch2_trans_run(c,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
+ BCH_READ_retry_if_stale|
+ BCH_READ_may_promote|
+ BCH_READ_user_mapped));
+}
+
+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
+ struct bch_read_bio *orig)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->c = orig->c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->split = true;
+ rbio->parent = orig;
+ rbio->opts = orig->opts;
+ return rbio;
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_io_opts opts)
+ struct bch_fs *c,
+ struct bch_io_opts opts,
+ bio_end_io_t end_io)
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->_state = 0;
- rbio->promote = NULL;
- rbio->opts = opts;
+ rbio->start_time = local_clock();
+ rbio->c = c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->opts = opts;
+ rbio->bio.bi_end_io = end_io;
return rbio;
}
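
rbio_init() now takes the filesystem, options and completion callback directly instead of leaving callers to fill in rbio->c, start_time and bi_end_io by hand. A hypothetical call site after the change (names made up, not taken from this patch; the bio must be the one embedded in a struct bch_read_bio, since to_rbio() uses container_of):

	struct bch_read_bio *rbio =
		rbio_init(&my_rbio.bio, c, io_opts, my_read_done_fn);
	bch2_read(c, rbio, inum);
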
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 03892388832b..29671075e3f1 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -34,6 +34,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
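
Like the read-side knob used by bch2_maybe_corrupt_bio() in io_read.c, this is a debug-only fault-injection parameter. Since module_param_named() is used with mode 0644 it should be adjustable at runtime; the path below follows the usual sysfs convention and is an assumption, not something stated in the patch:

/*
 * Debug builds only (CONFIG_BCACHEFS_DEBUG), e.g.:
 *
 *	echo 1000 > /sys/module/bcachefs/parameters/write_corrupt_ratio
 *
 * The exact meaning of the ratio is defined by bch2_maybe_corrupt_bio(),
 * which is not shown in this hunk.
 */
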
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -374,7 +380,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ op->flags & BCH_WRITE_check_enospc);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -396,29 +402,42 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
-static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
- u64 offset)
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
- bch2_inum_offset_err_msg(op->c, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- prt_printf(out, "write error%s: ",
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
-}
+ struct printbuf buf = PRINTBUF;
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
-{
- __bch2_write_op_error(out, op, op->pos.offset);
+ if (op->subvol) {
+ bch2_inum_offset_err_msg(op->c, &buf,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ } else {
+ struct bpos pos = op->pos;
+ pos.offset = offset;
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
+ }
+
+ prt_str(&buf, "write error: ");
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ if (op->flags & BCH_WRITE_move) {
+ struct data_update *u = container_of(op, struct data_update, op);
+
+ prt_printf(&buf, "\n from internal move ");
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
+ }
+
+ bch_err_ratelimited(op->c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
-static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
- struct bch_write_op *op, u64 offset)
+static void bch2_write_csum_err_msg(struct bch_write_op *op)
{
- bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- prt_printf(out, "write error%s: ",
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+ bch2_write_op_error(op, op->pos.offset,
+ "error verifying existing checksum while rewriting existing data (memory corruption?)");
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -493,7 +512,7 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
- if (!(op->flags & BCH_WRITE_MOVE))
+ if (!(op->flags & BCH_WRITE_move))
bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
@@ -516,7 +535,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
+ return -BCH_ERR_data_write_io;
}
if (dst != src)
@@ -539,7 +558,7 @@ static void __bch2_write_index(struct bch_write_op *op)
unsigned dev;
int ret = 0;
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
ret = bch2_write_drop_io_error_ptrs(op);
if (ret)
goto err;
@@ -548,7 +567,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
- ret = !(op->flags & BCH_WRITE_MOVE)
+ ret = !(op->flags & BCH_WRITE_move)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);
@@ -560,11 +579,8 @@ static void __bch2_write_index(struct bch_write_op *op)
if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- struct printbuf buf = PRINTBUF;
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
}
if (ret)
@@ -573,21 +589,29 @@ static void __bch2_write_index(struct bch_write_op *op)
out:
/* If a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
goto out;
}
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
if (state != wp->state) {
+ struct task_struct *p = current;
u64 now = ktime_get_ns();
+ u64 runtime = p->se.sum_exec_runtime +
+ (now - p->se.exec_start);
+
+ if (state == WRITE_POINT_runnable)
+ wp->last_runtime = runtime;
+ else if (wp->state == WRITE_POINT_runnable)
+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
if (wp->last_state_change &&
time_after64(now, wp->last_state_change))
@@ -601,7 +625,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
{
enum write_point_state state;
- state = running ? WRITE_POINT_running :
+ state = running ? WRITE_POINT_runnable :
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
: WRITE_POINT_stopped;
@@ -615,8 +639,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
- if ((op->flags & BCH_WRITE_SUBMITTED) &&
- (op->flags & BCH_WRITE_MOVE))
+ if ((op->flags & BCH_WRITE_submitted) &&
+ (op->flags & BCH_WRITE_move))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
spin_lock_irqsave(&wp->writes_lock, flags);
@@ -654,11 +678,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
if (!op)
break;
- op->flags |= BCH_WRITE_IN_WORKER;
+ op->flags |= BCH_WRITE_in_worker;
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@@ -676,13 +700,17 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_inum_offset_ratelimited(ca,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_IO_ERROR;
+ op->flags |= BCH_WRITE_io_error;
}
if (wbio->nocow) {
@@ -692,10 +720,8 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
- }
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -729,7 +755,7 @@ static void init_append_extent(struct bch_write_op *op,
bch2_extent_crc_append(&e->k_i, crc);
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_CACHED);
+ op->flags & BCH_WRITE_cached);
bch2_keylist_push(&op->insert_keys);
}
@@ -789,7 +815,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
{
struct bio *bio = &op->wbio.bio;
struct bch_extent_crc_unpacked new_crc;
- int ret;
/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
@@ -797,10 +822,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
bch2_csum_type_is_encryption(new_csum_type))
new_csum_type = op->crc.csum_type;
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
if (ret)
return ret;
@@ -810,44 +835,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
return 0;
}
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ int ret = 0;
BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
@@ -858,12 +851,13 @@ static enum prep_encoded_ret {
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ op->csum_type != op->crc.csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
- return PREP_ENCODED_DO_WRITE;
+ return 1;
}
/*
@@ -871,20 +865,23 @@ static enum prep_encoded_ret {
* is, we have to decompress it:
*/
if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
/* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
+ struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ goto csum_err;
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
- if (bch2_bio_uncompress_inplace(op, bio))
- return PREP_ENCODED_ERR;
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
+
+ ret = bch2_bio_uncompress_inplace(op, bio);
+ if (ret)
+ return ret;
}
/*
@@ -896,22 +893,34 @@ static enum prep_encoded_ret {
* If the data is checksummed and we're only writing a subset,
* rechecksum and adjust bio to point to currently live data:
*/
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+ struct bch_csum csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
- return PREP_ENCODED_OK;
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
+
+ return 0;
+csum_err:
+ bch2_write_csum_err_msg(op);
+ return -BCH_ERR_data_write_csum;
}
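
With the prep_encoded_ret enum gone, the return convention of bch2_write_prep_encoded_data() can be read off the caller below:

/*
 *   < 0	error (e.g. -BCH_ERR_data_write_csum); the caller jumps to err
 *     0	data still needs the normal checksum/compress path
 *     1	data can be written as-is (the old PREP_ENCODED_DO_WRITE case)
 */
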
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -930,39 +939,44 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
ec_buf = bch2_writepoint_ec_buf(c, wp);
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+ ret = bch2_write_prep_encoded_data(op, wp);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
}
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
}
if (ec_buf ||
op->compression_opt ||
(op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ !(op->flags & BCH_WRITE_pages_stable)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ !(op->flags & BCH_WRITE_pages_owned))) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+ if (!bounce && write_corrupt_ratio) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+#endif
saved_iter = dst->bi_iter;
do {
@@ -976,7 +990,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
break;
BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ (op->flags & BCH_WRITE_data_encoded) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_opt && !bounce);
@@ -1014,7 +1028,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
}
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ if ((op->flags & BCH_WRITE_data_encoded) &&
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
@@ -1046,7 +1060,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.compression_type = compression_type;
crc.nonce = nonce;
} else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ if ((op->flags & BCH_WRITE_data_encoded) &&
bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
@@ -1072,6 +1086,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (write_corrupt_ratio) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+#endif
+
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
@@ -1104,15 +1126,8 @@ do_write:
*_dst = dst;
return more;
csum_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = -EIO;
+ bch2_write_csum_err_msg(op);
+ ret = -BCH_ERR_data_write_csum;
err:
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
@@ -1190,39 +1205,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (ret) {
- op->error = ret;
+ if (ret)
break;
- }
}
bch2_trans_put(trans);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
+ }
+
+ if (ret)
+ op->error = ret;
}
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- op->error = -EIO;
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
+ op->error = -BCH_ERR_data_write_io;
+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1251,7 +1263,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct bucket_to_lock *stale_at;
int stale, ret;
- if (op->flags & BCH_WRITE_MOVE)
+ if (op->flags & BCH_WRITE_move)
return;
darray_init(&buckets);
@@ -1309,7 +1321,7 @@ retry:
}), GFP_KERNEL|__GFP_NOFAIL);
if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ op->flags |= BCH_WRITE_convert_unwritten;
}
/* Unlock before taking nocow locks, doing IO: */
@@ -1317,7 +1329,7 @@ retry:
bch2_trans_unlock(trans);
bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ if (op->flags & BCH_WRITE_convert_unwritten)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
@@ -1342,7 +1354,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}
op->pos.offset += bio_sectors(bio);
@@ -1352,11 +1364,12 @@ retry:
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
+
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);
bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
break;
bch2_btree_iter_advance(&iter);
}
@@ -1370,21 +1383,18 @@ err:
darray_exit(&buckets);
if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}
/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_SUBMITTED)) {
+ if (!(op->flags & BCH_WRITE_submitted)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_SYNC) {
+ } else if (op->flags & BCH_WRITE_sync) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
@@ -1414,7 +1424,7 @@ err_bucket_stale:
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_data_write_invalid_ptr;
} else {
/* We can retry this: */
ret = -BCH_ERR_transaction_restart;
@@ -1436,7 +1446,7 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
goto out_nofs_restore;
}
again:
@@ -1466,7 +1476,7 @@ again:
ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
op->write_point,
&op->devs_have,
op->nr_replicas,
@@ -1489,16 +1499,12 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1524,14 +1530,14 @@ err:
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
- if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_SUBMITTED) &&
- !(op->flags & BCH_WRITE_IN_WORKER))) {
+ if ((op->flags & BCH_WRITE_sync) ||
+ (!(op->flags & BCH_WRITE_submitted) &&
+ !(op->flags & BCH_WRITE_in_worker))) {
bch2_wait_on_allocator(c, &op->cl);
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1552,8 +1558,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
memset(&op->failed, 0, sizeof(op->failed));
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_wrote_data_inline;
+ op->flags |= BCH_WRITE_submitted;
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
@@ -1616,8 +1622,8 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+ if (op->flags & BCH_WRITE_only_specified_devs)
+ op->flags |= BCH_WRITE_alloc_nowait;
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
@@ -1625,11 +1631,8 @@ CLOSURE_CALLBACK(bch2_write)
wbio_init(bio)->put_bio = false;
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "misaligned write");
- printbuf_exit(&buf);
- op->error = -EIO;
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
+ op->error = -BCH_ERR_data_write_misaligned;
goto err;
}
@@ -1638,13 +1641,14 @@ CLOSURE_CALLBACK(bch2_write)
goto err;
}
- if (!(op->flags & BCH_WRITE_MOVE) &&
+ if (!(op->flags & BCH_WRITE_move) &&
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
}
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ if (!(op->flags & BCH_WRITE_move))
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1675,20 +1679,26 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
- prt_str(out, "pos: ");
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "started: ");
+ prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);
- prt_str(out, "flags: ");
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
printbuf_indent_sub(out, 2);
}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index b4626013abc8..b8ab19a1e1da 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -11,33 +11,27 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
+__printf(3, 4)
+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
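
The __printf(3, 4) annotation lets the compiler type-check the varargs against the format string, so a mismatched call (hypothetical, not from this patch) is caught at build time by -Wformat:

/*
 *	bch2_write_op_error(op, offset, "btree update error: %s", ret);
 *						     passing an int for %s now warns
 */
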
#define BCH_WRITE_FLAGS() \
- x(ALLOC_NOWAIT) \
- x(CACHED) \
- x(DATA_ENCODED) \
- x(PAGES_STABLE) \
- x(PAGES_OWNED) \
- x(ONLY_SPECIFIED_DEVS) \
- x(WROTE_DATA_INLINE) \
- x(FROM_INTERNAL) \
- x(CHECK_ENOSPC) \
- x(SYNC) \
- x(MOVE) \
- x(IN_WORKER) \
- x(SUBMITTED) \
- x(IO_ERROR) \
- x(CONVERT_UNWRITTEN)
+ x(alloc_nowait) \
+ x(cached) \
+ x(data_encoded) \
+ x(pages_stable) \
+ x(pages_owned) \
+ x(only_specified_devs) \
+ x(wrote_data_inline) \
+ x(check_enospc) \
+ x(sync) \
+ x(move) \
+ x(in_worker) \
+ x(submitted) \
+ x(io_error) \
+ x(convert_unwritten)
enum __bch_write_flags {
#define x(f) __BCH_WRITE_##f,
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index 6e878a6f2f0b..3ef6df9145ef 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -64,7 +64,7 @@ struct bch_write_op {
struct bpos pos;
struct bversion version;
- /* For BCH_WRITE_DATA_ENCODED: */
+ /* For BCH_WRITE_data_encoded: */
struct bch_extent_crc_unpacked crc;
struct write_point_specifier write_point;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 05b1250619ec..bfdaea6569ae 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -20,13 +20,6 @@
#include "journal_seq_blacklist.h"
#include "trace.h"
-static const char * const bch2_journal_errors[] = {
-#define x(n) #n,
- JOURNAL_ERRORS()
-#undef x
- NULL
-};
-
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -56,11 +49,18 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
+ if (!buf->write_started)
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
+ struct closure *cl = &buf->io;
+ int r = atomic_read(&cl->remaining);
+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
+
+ if (buf->data) {
+ prt_printf(out, "size:\t");
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+ }
prt_printf(out, "expires:\t");
prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
@@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
+ lockdep_assert_held(&j->lock);
+ out->atomic++;
+
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq++)
bch2_journal_buf_to_text(out, j, seq);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
+
+ --out->atomic;
}
static inline struct journal_buf *
@@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq)
EBUG_ON(seq > journal_cur_seq(j));
- if (journal_seq_unwritten(j, seq)) {
+ if (journal_seq_unwritten(j, seq))
buf = j->buf + (seq & JOURNAL_BUF_MASK);
- EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
- }
return buf;
}
@@ -139,8 +142,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
bool stuck = false;
struct printbuf buf = PRINTBUF;
- if (!(error == JOURNAL_ERR_journal_full ||
- error == JOURNAL_ERR_journal_pin_full) ||
+ if (!(error == -BCH_ERR_journal_full ||
+ error == -BCH_ERR_journal_pin_full) ||
nr_unwritten_journal_entries(j) ||
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
return stuck;
@@ -167,7 +170,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
spin_unlock(&j->lock);
bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
- bch2_journal_errors[error]);
+ bch2_err_str(error));
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "%s", buf.buf);
@@ -195,7 +198,8 @@ void bch2_journal_do_writes(struct journal *j)
if (w->write_started)
continue;
- if (!journal_state_count(j->reservations, idx)) {
+ if (!journal_state_seq_count(j, j->reservations, seq)) {
+ j->seq_write_started = seq;
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
@@ -306,7 +310,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
bch2_journal_space_available(j);
- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
}
void bch2_journal_halt(struct journal *j)
@@ -377,29 +381,41 @@ static int journal_entry_open(struct journal *j)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return JOURNAL_ERR_blocked;
+ return -BCH_ERR_journal_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
- if (bch2_journal_error(j))
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ int ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;
if (!fifo_free(&j->pin))
- return JOURNAL_ERR_journal_pin_full;
+ return -BCH_ERR_journal_pin_full;
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return JOURNAL_ERR_max_in_flight;
+ return -BCH_ERR_journal_max_in_flight;
+
+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
+ return -BCH_ERR_journal_max_open;
if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ return -BCH_ERR_journal_shutdown;
}
+ if (!j->free_buf && !buf->data)
+ return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
+
BUG_ON(!j->cur_entry_sectors);
+ if (!buf->data) {
+ swap(buf->data, j->free_buf);
+ swap(buf->buf_size, j->free_buf_size);
+ }
+
buf->expires =
(journal_cur_seq(j) == j->flushed_seq_ondisk
? jiffies
@@ -415,7 +431,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return JOURNAL_ERR_journal_full;
+ return -BCH_ERR_journal_full;
if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
@@ -464,7 +480,7 @@ static int journal_entry_open(struct journal *j)
new.idx++;
BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
journal_state_inc(&new);
@@ -514,6 +530,33 @@ static void journal_write_work(struct work_struct *work)
spin_unlock(&j->lock);
}
+static void journal_buf_prealloc(struct journal *j)
+{
+ if (j->free_buf &&
+ j->free_buf_size >= j->buf_size_want)
+ return;
+
+ unsigned buf_size = j->buf_size_want;
+
+ spin_unlock(&j->lock);
+ void *buf = kvmalloc(buf_size, GFP_NOFS);
+ spin_lock(&j->lock);
+
+ if (buf &&
+ (!j->free_buf ||
+ buf_size > j->free_buf_size)) {
+ swap(buf, j->free_buf);
+ swap(buf_size, j->free_buf_size);
+ }
+
+ if (unlikely(buf)) {
+ spin_unlock(&j->lock);
+ /* kvfree can sleep */
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+}
+
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@@ -525,25 +568,28 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
- if (bch2_journal_error(j))
- return -BCH_ERR_erofs_journal_err;
+ ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;
if (j->blocked)
- return -BCH_ERR_journal_res_get_blocked;
+ return -BCH_ERR_journal_blocked;
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = JOURNAL_ERR_journal_full;
+ ret = -BCH_ERR_journal_full;
can_discard = j->can_discard;
goto out;
}
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = JOURNAL_ERR_max_in_flight;
+ ret = -BCH_ERR_journal_max_in_flight;
goto out;
}
spin_lock(&j->lock);
+ journal_buf_prealloc(j);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
@@ -566,25 +612,48 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
+ ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
out:
- if (ret == JOURNAL_ERR_retry)
- goto retry;
- if (!ret)
+ if (likely(!ret))
return 0;
+ if (ret == -BCH_ERR_journal_retry_open)
+ goto retry;
if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_res_get_blocked;
+ ret = -BCH_ERR_journal_stuck;
- if (ret == JOURNAL_ERR_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+ if (ret == -BCH_ERR_journal_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
+ trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_printbuf_make_room(&buf, 4096);
+
+ spin_lock(&j->lock);
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
+ if (ret == -BCH_ERR_journal_max_open &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
+ trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
+
+ bch2_printbuf_make_room(&buf, 4096);
+
+ spin_lock(&j->lock);
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);
+
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
@@ -594,8 +663,8 @@ out:
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == JOURNAL_ERR_journal_full ||
- ret == JOURNAL_ERR_journal_pin_full) &&
+ if ((ret == -BCH_ERR_journal_full ||
+ ret == -BCH_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
@@ -608,9 +677,7 @@ out:
}
}
- return ret == JOURNAL_ERR_insufficient_devices
- ? -BCH_ERR_erofs_journal_err
- : -BCH_ERR_journal_res_get_blocked;
+ return ret;
}
static unsigned max_dev_latency(struct bch_fs *c)
@@ -640,7 +707,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
int ret;
if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
HZ))
return ret;
@@ -654,7 +721,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
remaining_wait = max(0, remaining_wait - HZ);
if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
remaining_wait))
return ret;
@@ -666,7 +733,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
printbuf_exit(&buf);
closure_wait_event(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
@@ -687,7 +754,6 @@ void bch2_journal_entry_res_resize(struct journal *j,
goto out;
j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- smp_mb();
state = READ_ONCE(j->reservations);
if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
@@ -907,7 +973,7 @@ int bch2_journal_meta(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
- return -EROFS;
+ return -BCH_ERR_erofs_no_writes;
int ret = __bch2_journal_meta(j);
bch2_write_ref_put(c, BCH_WRITE_REF_journal);
@@ -951,7 +1017,8 @@ static void __bch2_journal_block(struct journal *j)
new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
} while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
}
}
@@ -992,7 +1059,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
*blocked = true;
}
- ret = journal_state_count(s, idx) > open
+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@@ -1349,6 +1416,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_write_started = cur_seq - 1;
j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
@@ -1389,8 +1457,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
- j->reservations.unwritten_idx++;
+ j->reservations.idx = journal_cur_seq(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
@@ -1443,7 +1510,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
return -BCH_ERR_ENOMEM_dev_journal_init;
@@ -1482,6 +1549,7 @@ void bch2_fs_journal_exit(struct journal *j)
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
+ kvfree(j->free_buf);
free_fifo(&j->pin);
}
@@ -1508,13 +1576,13 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
- if (!j->buf[i].data)
- return -BCH_ERR_ENOMEM_journal_buf;
+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
+ if (!j->free_buf)
+ return -BCH_ERR_ENOMEM_journal_buf;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
j->buf[i].idx = i;
- }
j->pin.front = j->pin.back = 1;
@@ -1564,6 +1632,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
@@ -1571,7 +1640,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
prt_printf(out, "current entry:\t");
switch (s.cur_entry_offset) {
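The journal_buf_prealloc() hunk above (and the matching free-buffer recycling added to journal_write_done() further down) follows one pattern: drop the spinlock around any allocation or free, and only swap buffer pointers while it is held. A minimal userspace sketch of that pattern, with a pthread mutex and malloc/free standing in for j->lock and kvmalloc/kvfree; the struct and function names here are illustrative, not from the patch:

#include <pthread.h>
#include <stdlib.h>

struct prealloc {
	pthread_mutex_t	lock;
	void		*free_buf;
	size_t		free_buf_size;
	size_t		buf_size_want;
};

static void buf_prealloc(struct prealloc *j)
{
	pthread_mutex_lock(&j->lock);
	if (j->free_buf && j->free_buf_size >= j->buf_size_want) {
		pthread_mutex_unlock(&j->lock);
		return;
	}

	size_t buf_size = j->buf_size_want;

	/* never allocate while holding the lock - the real code holds a spinlock */
	pthread_mutex_unlock(&j->lock);
	void *buf = malloc(buf_size);
	pthread_mutex_lock(&j->lock);

	if (buf && (!j->free_buf || buf_size > j->free_buf_size)) {
		/* install the bigger buffer, take ownership of the old one */
		void *tb = j->free_buf;       j->free_buf = buf;           buf = tb;
		size_t ts = j->free_buf_size; j->free_buf_size = buf_size; buf_size = ts;
	}
	pthread_mutex_unlock(&j->lock);

	/* whatever we are left holding is freed outside the lock, like kvfree() */
	free(buf);
}

int main(void)
{
	struct prealloc j = { .lock = PTHREAD_MUTEX_INITIALIZER, .buf_size_want = 64 << 10 };

	buf_prealloc(&j);		/* first call installs a 64k buffer */
	j.buf_size_want = 128 << 10;
	buf_prealloc(&j);		/* wanting more replaces it with a 128k one */
	free(j.free_buf);
	return 0;
}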
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 107f7f901cd9..47828771f9c2 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j)
closure_wake_up(&j->async_wait);
}
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
return j->seq_ondisk + 1;
}
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ unsigned idx = (journal_cur_seq(j) &
+ JOURNAL_BUF_MASK &
+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
+
+ return j->buf + idx;
+}
+
static inline int journal_state_count(union journal_res_state s, int idx)
{
switch (idx) {
@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx)
BUG();
}
+static inline int journal_state_seq_count(struct journal *j,
+ union journal_res_state s, u64 seq)
+{
+ if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
+ else
+ return 0;
+}
+
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
static inline struct jset_entry *
journal_res_entry(struct journal *j, struct journal_res *res)
{
- return vstruct_idx(j->buf[res->idx].data, res->offset);
+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
}
static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);
-static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
bch2_journal_buf_put_final(j, seq);
}
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, 0);
- bch2_journal_buf_put(j, res->idx, res->seq);
+ bch2_journal_buf_put(j, res->seq);
res->ref = 0;
}
@@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j,
/*
* Check if there is still room in the current journal
- * entry:
+ * entry, smp_rmb() guarantees that reads from reservations.counter
+ * occur before accessing cur_entry_u64s:
*/
+ smp_rmb();
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
@@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j,
&old.v, new.v));
res->ref = true;
- res->idx = old.idx;
res->offset = old.cur_entry_offset;
- res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+ res->seq = journal_cur_seq(j);
+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
return 1;
}
@@ -390,6 +407,7 @@ out:
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
NULL, _THIS_IP_);
EBUG_ON(!res->ref);
+ BUG_ON(!res->seq);
}
return 0;
}
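With JOURNAL_STATE_BUF_BITS (2) now separate from JOURNAL_BUF_BITS (4), a sequence number indexes two different things: its slot in the reservation state machine and its slot in the j->buf array. A small standalone sketch of that mapping, plus the res->seq reconstruction done in journal_res_get_fast() above; the constants are assumed to match journal_types.h later in this patch:

#include <stdint.h>
#include <assert.h>

#define JOURNAL_STATE_BUF_MASK	3	/* (1 << 2) - 1: 4 in-flight state slots */
#define JOURNAL_BUF_MASK	15	/* (1 << 4) - 1: 16 data buffers */

static unsigned state_idx(uint64_t seq)	{ return seq & JOURNAL_STATE_BUF_MASK; }
static unsigned data_idx(uint64_t seq)	{ return seq & JOURNAL_BUF_MASK; }

/* res->seq reconstruction, as in journal_res_get_fast(): */
static uint64_t res_seq(uint64_t cur_seq, unsigned old_idx)
{
	return cur_seq - ((cur_seq - old_idx) & JOURNAL_STATE_BUF_MASK);
}

int main(void)
{
	/* seq 37 lands in state slot 1 and data buffer 5 */
	assert(state_idx(37) == 1 && data_idx(37) == 5);
	/* reservation taken on idx 1 while cur_seq is 37: it belongs to seq 37 */
	assert(res_seq(37, 1) == 37);
	/* reservation taken on idx 0: the open entry was one seq behind */
	assert(res_seq(37, 0) == 36);
	return 0;
}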
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 11c39e0c34f4..4ed6137f0439 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1041,13 +1041,19 @@ reread:
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
+ u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);
- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
- "journal read error: sector %llu",
- offset) ||
- bch2_meta_read_fault("journal")) {
+ if (!ret && bch2_meta_read_fault("journal"))
+ ret = -BCH_ERR_EIO_fault_injected;
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ submit_time, !ret);
+
+ if (ret) {
+ bch_err_dev_ratelimited(ca,
+ "journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
@@ -1110,13 +1116,16 @@ reread:
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good) {
+ bch_err_dev_ratelimited(ca, "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf));
saw_bad = true;
+ }
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1515,7 +1524,7 @@ static void __journal_write_alloc(struct journal *j,
* @j: journal object
* @w: journal buf (entry to be written)
*
- * Returns: 0 on success, or -EROFS on failure
+ * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
@@ -1600,18 +1609,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf);
}
-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
-{
- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-}
-
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
- union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
@@ -1621,12 +1624,11 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- err = -EIO;
+ err = -BCH_ERR_journal_write_err;
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
- if (bch2_mark_replicas(c, &replicas.e))
- err = -EIO;
+ err = bch2_mark_replicas(c, &replicas.e);
}
if (err)
@@ -1641,7 +1643,23 @@ static CLOSURE_CALLBACK(journal_write_done)
j->err_seq = seq;
w->write_done = true;
+ if (!j->free_buf || j->free_buf_size < w->buf_size) {
+ swap(j->free_buf, w->data);
+ swap(j->free_buf_size, w->buf_size);
+ }
+
+ if (w->data) {
+ void *buf = w->data;
+ w->data = NULL;
+ w->buf_size = 0;
+
+ spin_unlock(&j->lock);
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+
bool completed = false;
+ bool do_discards = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1650,11 +1668,10 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->write_done)
break;
- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
+ if (!j->err_seq && !w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
- bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
@@ -1671,16 +1688,6 @@ static CLOSURE_CALLBACK(journal_write_done)
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
-
- new.unwritten_idx++;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
closure_wake_up(&w->wait);
completed = true;
}
@@ -1695,7 +1702,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1715,6 +1722,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
+
+ if (do_discards)
+ bch2_do_discards(c);
}
static void journal_write_endio(struct bio *bio)
@@ -1724,13 +1734,16 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ jbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("journal")) {
- unsigned long flags;
+ bch2_blk_status_to_str(bio->bi_status));
+ unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
@@ -1759,7 +1772,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
sectors);
struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
+ struct journal_bio *jbio = ja->bio[w->idx];
+ struct bio *bio = &jbio->bio;
+
+ jbio->submit_time = local_clock();
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1791,6 +1808,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ /*
+ * Wait for previous journal writes to complete; they won't necessarily
+ * be flushed if they're still in flight
+ */
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
@@ -1984,7 +2005,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* write anything at all.
*/
if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return -EIO;
+ return error;
if (error ||
w->noflush ||
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index d373cd181a7f..5d1547aa118a 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -226,7 +226,7 @@ void bch2_journal_space_available(struct journal *j)
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = JOURNAL_ERR_insufficient_devices;
+ ret = -BCH_ERR_insufficient_journal_devices;
goto out;
}
@@ -240,7 +240,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
- ret = JOURNAL_ERR_journal_full;
+ ret = -BCH_ERR_journal_full;
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -645,7 +645,6 @@ static u64 journal_seq_to_flush(struct journal *j)
* @j: journal object
* @direct: direct or background reclaim?
* @kicked: requested to run since we last ran?
- * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -685,10 +684,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (kthread && kthread_should_stop())
break;
- if (bch2_journal_error(j)) {
- ret = -EIO;
+ ret = bch2_journal_error(j);
+ if (ret)
break;
- }
bch2_journal_do_discards(j);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 1f25c111c54c..e463d2d95359 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -231,15 +231,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
- unsigned i;
- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
- src < bl->start + nr;
- src++, i = eytzinger0_next(i, nr)) {
+ src = bl->start;
+ eytzinger0_for_each(i, nr) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
+ src++;
}
unsigned new_nr = dst - bl->start;
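eytzinger0_for_each() walks the eytzinger-ordered table in ascending key order, which is what keeps t->entries[i] in lockstep with the sorted superblock list src here. A toy sketch of that visit order — a plain recursive in-order walk over the implicit tree, not the branch-free iterator the real macro uses:

#include <stdio.h>

static void inorder(const unsigned *tree, unsigned nr, unsigned i)
{
	if (i >= nr)
		return;
	inorder(tree, nr, 2 * i + 1);	/* left subtree */
	printf("%u ", tree[i]);		/* visit: keys come out in ascending order */
	inorder(tree, nr, 2 * i + 2);	/* right subtree */
}

int main(void)
{
	/* eytzinger (breadth-first) layout of the sorted keys 1..7 */
	const unsigned tree[] = { 4, 2, 6, 1, 3, 5, 7 };

	inorder(tree, 7, 0);		/* prints: 1 2 3 4 5 6 7 */
	printf("\n");
	return 0;
}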
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 1ef3a28ed6ab..8e0eba776b9d 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -12,7 +12,11 @@
/* btree write buffer steals 8 bits for its own purposes: */
#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
-#define JOURNAL_BUF_BITS 2
+#define JOURNAL_STATE_BUF_BITS 2
+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
+
+#define JOURNAL_BUF_BITS 4
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
@@ -82,7 +86,6 @@ struct journal_entry_pin {
struct journal_res {
bool ref;
- u8 idx;
u16 u64s;
u32 offset;
u64 seq;
@@ -98,9 +101,8 @@ union journal_res_state {
};
struct {
- u64 cur_entry_offset:20,
+ u64 cur_entry_offset:22,
idx:2,
- unwritten_idx:2,
buf0_count:10,
buf1_count:10,
buf2_count:10,
@@ -110,13 +112,13 @@ union journal_res_state {
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
/*
 * We stash some journal state as sentinel values in cur_entry_offset:
* note - cur_entry_offset is in units of u64s
*/
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
@@ -149,28 +151,12 @@ enum journal_flags {
#undef x
};
-/* Reasons we may fail to get a journal reservation: */
-#define JOURNAL_ERRORS() \
- x(ok) \
- x(retry) \
- x(blocked) \
- x(max_in_flight) \
- x(journal_full) \
- x(journal_pin_full) \
- x(journal_stuck) \
- x(insufficient_devices)
-
-enum journal_errors {
-#define x(n) JOURNAL_ERR_##n,
- JOURNAL_ERRORS()
-#undef x
-};
-
typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
+ u64 submit_time;
struct bio bio;
};
@@ -199,7 +185,7 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- enum journal_errors cur_entry_error;
+ int cur_entry_error;
unsigned cur_entry_offset_if_blocked;
unsigned buf_size_want;
@@ -220,6 +206,8 @@ struct journal {
* other is possibly being written out.
*/
struct journal_buf buf[JOURNAL_BUF_NR];
+ void *free_buf;
+ unsigned free_buf_size;
spinlock_t lock;
@@ -237,6 +225,7 @@ struct journal {
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
+ u64 seq_write_started;
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
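Dropping unwritten_idx and widening cur_entry_offset from 20 to 22 bits keeps the reservation state at exactly 64 bits (22 + 2 + 4×10), so it can still be updated with a single atomic64 cmpxchg. A compile-time sketch of that layout; buf3_count is assumed unchanged from the original struct, and u64 bitfields rely on GCC/Clang behaviour as the kernel does:

#include <stdint.h>

union journal_res_state_sketch {
	uint64_t v;
	struct {
		uint64_t cur_entry_offset:22,
			 idx:2,
			 buf0_count:10,
			 buf1_count:10,
			 buf2_count:10,
			 buf3_count:10;	/* 22 + 2 + 40 == 64 */
	};
};

_Static_assert(sizeof(union journal_res_state_sketch) == sizeof(uint64_t),
	       "state must fit in one word for atomic64_try_cmpxchg()");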
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index ce794d55818f..a299d9ec8ee4 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
+#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
@@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
}
-int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
+int __bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
{
if (old_time == new_time)
return 0;
@@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = {
};
int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id, u64 time,
+ u16 lru_id,
+ u64 dev_bucket,
+ u64 time,
struct bkey_s_c referring_k,
struct bkey_buf *last_flushed)
{
@@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
struct btree_iter lru_iter;
struct bkey_s_c lru_k =
bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id,
- bucket_to_u64(referring_k.k->p),
- time), 0);
+ lru_pos(lru_id, dev_bucket, time), 0);
int ret = bkey_err(lru_k);
if (ret)
return ret;
@@ -104,7 +105,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
" %s",
bch2_lru_types[lru_type(lru_k)],
(bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
if (ret)
goto err;
}
@@ -116,49 +117,73 @@ fsck_err:
return ret;
}
+static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
+{
+ enum bch_lru_type type = lru_type(lru_k);
+
+ switch (type) {
+ case BCH_LRU_read:
+ case BCH_LRU_fragmentation:
+ return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
+ case BCH_LRU_stripes:
+ return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
+ default:
+ BUG();
+ }
+}
+
+static u64 bkey_lru_type_idx(struct bch_fs *c,
+ enum bch_lru_type type,
+ struct bkey_s_c k)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+
+ switch (type) {
+ case BCH_LRU_read:
+ a = bch2_alloc_to_v4(k, &a_convert);
+ return alloc_lru_idx_read(*a);
+ case BCH_LRU_fragmentation: {
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
+ u64 idx = ca
+ ? alloc_lru_idx_fragmentation(*a, ca)
+ : 0;
+ rcu_read_unlock();
+ return idx;
+ }
+ case BCH_LRU_stripes:
+ return k.k->type == KEY_TYPE_stripe
+ ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
+ : 0;
+ default:
+ BUG();
+ }
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
- enum bch_lru_type type = lru_type(lru_k);
- struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
- u64 idx;
- int ret;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
- if (fsck_err_on(!ca,
- trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- alloc_pos.inode, alloc_pos.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ struct bbpos bp = lru_pos_to_bp(lru_k);
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
+ int ret = bkey_err(k);
if (ret)
goto err;
- a = bch2_alloc_to_v4(k, &a_convert);
-
- switch (type) {
- case BCH_LRU_read:
- idx = alloc_lru_idx_read(*a);
- break;
- case BCH_LRU_fragmentation:
- idx = alloc_lru_idx_fragmentation(*a, ca);
- break;
- }
+ enum bch_lru_type type = lru_type(lru_k);
+ u64 idx = bkey_lru_type_idx(c, type, k);
- if (lru_k.k->type != KEY_TYPE_set ||
- lru_pos_time(lru_k.k->p) != idx) {
+ if (lru_pos_time(lru_k.k->p) != idx) {
ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
if (ret)
goto err;
@@ -176,7 +201,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index f31a6cf1514c..8abd0aa2083a 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
- if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ switch (lru_id) {
+ case BCH_LRU_BUCKET_FRAGMENTATION:
return BCH_LRU_fragmentation;
- return BCH_LRU_read;
+ case BCH_LRU_STRIPE_FRAGMENTATION:
+ return BCH_LRU_stripes;
+ default:
+ return BCH_LRU_read;
+ }
}
int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
@@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+static inline int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ return old_time != new_time
+ ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
+ : 0;
+}
struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
int bch2_check_lrus(struct bch_fs *);
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
index f372cb3b8cda..b7392ad8e41f 100644
--- a/fs/bcachefs/lru_format.h
+++ b/fs/bcachefs/lru_format.h
@@ -9,7 +9,8 @@ struct bch_lru {
#define BCH_LRU_TYPES() \
x(read) \
- x(fragmentation)
+ x(fragmentation) \
+ x(stripes)
enum bch_lru_type {
#define x(n) BCH_LRU_##n,
@@ -17,7 +18,8 @@ enum bch_lru_type {
#undef x
};
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
+#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
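As inferred from lru_type() and lru_pos_to_bp() in the hunks above, an LRU key packs the 16-bit lru_id into the high bits of the inode, the 48-bit time into its low bits, and carries the dev:bucket (or stripe) index in the offset; the two fragmentation LRUs simply reserve the top two id values. A hedged sketch of that encoding (field packing assumed, not taken verbatim from lru.h):

#include <stdint.h>
#include <assert.h>

#define LRU_TIME_BITS			48
#define BCH_LRU_BUCKET_FRAGMENTATION	((1U << 16) - 1)
#define BCH_LRU_STRIPE_FRAGMENTATION	((1U << 16) - 2)

struct bpos_sketch { uint64_t inode, offset; };

static struct bpos_sketch lru_pos_sketch(uint16_t lru_id, uint64_t dev_bucket, uint64_t time)
{
	assert(time < (1ULL << LRU_TIME_BITS));
	return (struct bpos_sketch) {
		.inode	= ((uint64_t) lru_id << LRU_TIME_BITS) | time,
		.offset	= dev_bucket,
	};
}

static uint16_t lru_pos_id_sketch(struct bpos_sketch p)   { return p.inode >> LRU_TIME_BITS; }
static uint64_t lru_pos_time_sketch(struct bpos_sketch p) { return p.inode & ((1ULL << LRU_TIME_BITS) - 1); }

int main(void)
{
	struct bpos_sketch p = lru_pos_sketch(BCH_LRU_STRIPE_FRAGMENTATION, 123, 456);

	assert(lru_pos_id_sketch(p)   == BCH_LRU_STRIPE_FRAGMENTATION);
	assert(lru_pos_time_sketch(p) == 456);
	assert(p.offset == 123);
	return 0;
}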
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index ddc187fb693d..57ad662871ba 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -15,6 +15,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
+#include "progress.h"
#include "replicas.h"
#include "super-io.h"
@@ -76,7 +77,9 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_usrdata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@@ -88,8 +91,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
+ }));
if (ret)
break;
}
@@ -99,7 +104,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
return ret;
}
-static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_metadata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, int flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@@ -125,6 +132,8 @@ retry:
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
+
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
@@ -169,6 +178,11 @@ err:
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, dev_idx, flags);
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c,
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink));
+
+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
}
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 160b4374160a..8fcdc6984f6e 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- if (trace_move_extent_enabled()) {
+ if (trace_io_move_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_move_extent(c, buf.buf);
+ trace_io_move(c, buf.buf);
printbuf_exit(&buf);
}
}
-static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_move_extent_read_enabled()) {
+ if (trace_io_move_read_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
- trace_move_extent_read(c, buf.buf);
+ trace_io_move_read(c, buf.buf);
printbuf_exit(&buf);
}
}
@@ -74,11 +74,7 @@ struct moving_io {
unsigned read_sectors;
unsigned write_sectors;
- struct bch_read_bio rbio;
-
struct data_update write;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[];
};
static void move_free(struct moving_io *io)
@@ -88,43 +84,72 @@ static void move_free(struct moving_io *io)
if (io->b)
atomic_dec(&io->b->count);
- bch2_data_update_exit(&io->write);
-
mutex_lock(&ctxt->lock);
list_del(&io->io_list);
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);
+ if (!io->write.data_opts.scrub) {
+ bch2_data_update_exit(&io->write);
+ } else {
+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+ kfree(io->write.bvecs);
+ }
kfree(io);
}
static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt;
- if (io->write.op.error)
+ if (op->error) {
+ if (trace_io_move_write_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_write_op_to_text(&buf, op);
+ prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
+ trace_io_move_write_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+
ctxt->write_error = true;
+ }
- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_dec(&io->write.ctxt->write_ios);
+ atomic_sub(io->write_sectors, &ctxt->write_sectors);
+ atomic_dec(&ctxt->write_ios);
move_free(io);
closure_put(&ctxt->cl);
}
static void move_write(struct moving_io *io)
{
- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (ctxt->stats) {
+ if (io->write.rbio.bio.bi_status)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_uncorrected);
+ else if (io->write.rbio.saw_error)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_corrected);
+ }
+
+ if (unlikely(io->write.rbio.ret ||
+ io->write.rbio.bio.bi_status ||
+ io->write.data_opts.scrub)) {
move_free(io);
return;
}
- if (trace_move_extent_write_enabled()) {
+ if (trace_io_move_write_enabled()) {
struct bch_fs *c = io->write.op.c;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_move_extent_write(c, buf.buf);
+ trace_io_move_write(c, buf.buf);
printbuf_exit(&buf);
}
@@ -132,7 +157,7 @@ static void move_write(struct moving_io *io)
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
- bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+ bch2_data_update_read_done(&io->write);
}
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
@@ -145,7 +170,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx
static void move_read_endio(struct bio *bio)
{
- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors);
@@ -258,14 +283,10 @@ int bch2_move_extent(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct moving_io *io;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- trace_move_extent2(c, k, &io_opts, &data_opts);
+ trace_io_move2(c, k, &io_opts, &data_opts);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
@@ -273,7 +294,8 @@ int bch2_move_extent(struct moving_context *ctxt,
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas) {
+ !data_opts.extra_replicas &&
+ !data_opts.scrub) {
if (data_opts.kill_ptrs)
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
@@ -285,13 +307,7 @@ int bch2_move_extent(struct moving_context *ctxt,
*/
bch2_trans_unlock(trans);
- /* write path might have to decompress data: */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
-
- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
- io = kzalloc(sizeof(struct moving_io) +
- sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
if (!io)
goto err;
@@ -300,31 +316,27 @@ int bch2_move_extent(struct moving_context *ctxt,
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;
- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->write.op.wbio.bio.bi_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
-
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
- GFP_KERNEL))
- goto err_free;
+ if (!data_opts.scrub) {
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ &io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free;
- io->rbio.c = c;
- io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->rbio.bio.bi_vcnt = pages;
- io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- io->rbio.bio.bi_iter.bi_size = sectors << 9;
+ io->write.op.end_io = move_write_done;
+ } else {
+ bch2_bkey_buf_init(&io->write.k);
+ bch2_bkey_buf_reassemble(&io->write.k, c, k);
- io->rbio.bio.bi_opf = REQ_OP_READ;
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
- io->rbio.bio.bi_end_io = move_read_endio;
+ io->write.op.c = c;
+ io->write.data_opts = data_opts;
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free_pages;
+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+ if (ret)
+ goto err_free;
+ }
- io->write.op.end_io = move_write_done;
+ io->write.rbio.bio.bi_end_io = move_read_endio;
+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
@@ -339,9 +351,7 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
- trace_move_extent_read2(c, k);
+ trace_io_move_read2(c, k);
mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -356,33 +366,33 @@ int bch2_move_extent(struct moving_context *ctxt,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(trans, &io->rbio,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- BCH_READ_NODECODE|
- BCH_READ_LAST_FRAGMENT);
+ __bch2_read_extent(trans, &io->write.rbio,
+ io->write.rbio.bio.bi_iter,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ NULL,
+ BCH_READ_last_fragment,
+ data_opts.scrub ? data_opts.read_dev : -1);
return 0;
-err_free_pages:
- bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
- if (ret == -BCH_ERR_data_update_done)
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
return 0;
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- count_event(c, move_extent_start_fail);
+ count_event(c, io_move_start_fail);
- if (trace_move_extent_start_fail_enabled()) {
+ if (trace_io_move_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
- trace_move_extent_start_fail(c, buf.buf);
+ trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
return ret;
@@ -551,6 +561,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
BTREE_ITER_all_snapshots);
if (ctxt->rate)
@@ -581,7 +592,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ s64 offset_into_extent = 0;
bch2_trans_iter_exit(trans, &reflink_iter);
k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
@@ -600,6 +611,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
* pointer - need to fixup iter->k
*/
extent_iter = &reflink_iter;
+ offset_into_extent = 0;
}
if (!bkey_extent_is_direct_data(k.k))
@@ -627,7 +639,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
- if (ret2 == -ENOMEM) {
+ if (bch2_err_matches(ret2, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
continue;
@@ -689,21 +701,22 @@ int bch2_move_data(struct bch_fs *c,
bool wait_on_copygc,
move_pred_fn pred, void *arg)
{
-
struct moving_context ctxt;
- int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts _data_opts)
+static int __bch2_move_data_phys(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ unsigned dev,
+ u64 bucket_start,
+ u64 bucket_end,
+ unsigned data_types,
+ move_pred_fn pred, void *arg)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -712,16 +725,19 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
- struct data_update_opts data_opts;
- unsigned sectors_moved = 0;
struct bkey_buf last_flushed;
int ret = 0;
- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ struct bch_dev *ca = bch2_dev_tryget(c, dev);
if (!ca)
return 0;
- trace_bucket_evacuate(c, &bucket);
+ bucket_end = min(bucket_end, ca->mi.nbuckets);
+
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
+ bch2_dev_put(ca);
+ ca = NULL;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -732,8 +748,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
*/
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket), 0);
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
bch_err_msg(c, ret, "looking up alloc key");
if (ret)
@@ -757,7 +772,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (ret)
goto err;
- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
+ if (!k.k || bkey_gt(k.k->p, bp_end))
break;
if (k.k->type != KEY_TYPE_backpointer)
@@ -765,107 +780,148 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- if (!bp.v->level) {
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
+ if (ctxt->stats)
+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ if (!(data_types & BIT(bp.v->data_type)))
+ goto next;
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
+ if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
+ goto next;
+
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+ if (!bp.v->level) {
ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
}
+ }
- data_opts = _data_opts;
- data_opts.target = io_opts.background_target;
- data_opts.rewrite_ptrs = 0;
-
- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
- unsigned i = 0;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- if (p.ptr.dev == bucket.inode) {
- if (p.ptr.cached) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
- data_opts.rewrite_ptrs |= 1U << i;
- break;
- }
- i++;
- }
-
- ret = bch2_move_extent(ctxt, bucket_in_flight,
- &iter, k, io_opts, data_opts);
+ struct data_update_opts data_opts = {};
+ if (!pred(c, arg, k, &io_opts, &data_opts)) {
bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
-
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
- sectors_moved += sectors;
- } else {
- struct btree *b;
+ if (data_opts.scrub &&
+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+ bch2_trans_iter_exit(trans, &iter);
+ ret = -BCH_ERR_device_offline;
+ break;
+ }
- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- goto next;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!b)
- goto next;
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ /* move_extent will drop locks */
+ unsigned sectors = bp.v->bucket_len;
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
- bch2_trans_iter_exit(trans, &iter);
+ if (!bp.v->level)
+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
+ bch2_trans_iter_exit(trans, &iter);
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, sectors);
- if (ctxt->stats) {
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
- atomic64_add(sectors, &ctxt->stats->sectors_moved);
- }
- sectors_moved += btree_sectors(c);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
}
+ if (ret)
+ goto err;
+
+ if (ctxt->stats)
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
next:
bch2_btree_iter_advance(&bp_iter);
}
-
- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
err:
bch2_trans_iter_exit(trans, &bp_iter);
- bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
+static int bch2_move_data_phys(struct bch_fs *c,
+ unsigned dev,
+ u64 start,
+ u64 end,
+ unsigned data_types,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+ struct moving_context ctxt;
+
+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ctxt.stats->phys = true;
+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
+
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+struct evacuate_bucket_arg {
+ struct bpos bucket;
+ int gen;
+ struct data_update_opts data_opts;
+};
+
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct evacuate_bucket_arg *arg = _arg;
+
+ *data_opts = arg->data_opts;
+
+ unsigned i = 0;
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == arg->bucket.inode &&
+ (arg->gen < 0 || arg->gen == ptr->gen) &&
+ !ptr->cached)
+ data_opts->rewrite_ptrs |= BIT(i);
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
+}
+
+int bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts)
+{
+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
+
+ return __bch2_move_data_phys(ctxt, bucket_in_flight,
+ bucket.inode,
+ bucket.offset,
+ bucket.offset + 1,
+ ~0,
+ evacuate_bucket_pred, &arg);
+}
+
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
@@ -1007,14 +1063,6 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
-static bool migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
/*
* Ancient versions of bcachefs produced packed formats which could represent
* keys that the in memory format cannot represent; this checks for those
@@ -1104,6 +1152,30 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_ioctl_data *arg = _arg;
+
+ if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == arg->migrate.dev) {
+ if (!p.crc.csum_type)
+ return false;
+ break;
+ }
+ }
+
+ data_opts->scrub = true;
+ data_opts->read_dev = arg->migrate.dev;
+ return true;
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
@@ -1118,6 +1190,22 @@ int bch2_data_job(struct bch_fs *c,
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
switch (op.op) {
+ case BCH_DATA_OP_scrub:
+ /*
+ * prevent tests from spuriously failing: make sure we see all
+ * btree nodes that need to be repaired
+ */
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+ op.scrub.data_types,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ false,
+ scrub_pred, &op) ?: ret;
+ break;
+
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -1137,14 +1225,14 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_btree(c, start, end,
- migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
+ ~0,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
+ bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_rewrite_old_nodes:
@@ -1176,17 +1264,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen: ");
+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_printf(out, "bytes moved: ");
+ prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_printf(out, "bytes raced: ");
+ prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1195,7 +1283,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
- struct moving_io *io;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
@@ -1215,8 +1304,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock);
+ struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_write_op_to_text(out, &io->write.op);
+ bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock);
printbuf_indent_sub(out, 4);
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index e22841ef31e4..807f779f6f76 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -3,22 +3,36 @@
#define _BCACHEFS_MOVE_TYPES_H
#include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
struct bch_move_stats {
- enum bch_data_type data_type;
- struct bbpos pos;
char name[32];
+ bool phys;
+ enum bch_ioctl_data_event_ret ret;
+
+ union {
+ struct {
+ enum bch_data_type data_type;
+ struct bbpos pos;
+ };
+ struct {
+ unsigned dev;
+ u64 offset;
+ };
+ };
atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
+ atomic64_t sectors_error_corrected;
+ atomic64_t sectors_error_uncorrected;
};
struct move_bucket_key {
struct bpos bucket;
- u8 gen;
+ unsigned gen;
};
struct move_bucket {
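
The reworked bch_move_stats above keys a union on the new phys flag: logical moves keep a (data_type, bbpos) position, while physical moves record a (device, offset) pair. A minimal standalone sketch of that tagged-union idiom follows; the field types are simplified and the names are invented, this is not the kernel struct itself.

#include <stdio.h>

/* Sketch of the tagged union introduced above: the "phys" flag selects
 * whether a position is a (data_type, pos) pair or a (device, offset) pair.
 * Field types are simplified; this is not the kernel struct. */
struct move_stats {
	_Bool phys;
	union {
		struct { unsigned data_type; unsigned long long pos; };
		struct { unsigned dev; unsigned long long offset; };
	};
};

static void print_pos(const struct move_stats *s)
{
	if (s->phys)
		printf("device %u, offset %llu\n", s->dev, s->offset);
	else
		printf("data_type %u, pos %llu\n", s->data_type, s->pos);
}

int main(void)
{
	struct move_stats logical = {0}, physical = {0};

	logical.data_type = 1;
	logical.pos = 42;

	physical.phys = 1;
	physical.dev = 3;
	physical.offset = 4096;

	print_pos(&logical);
	print_pos(&physical);
	return 0;
}
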
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 6718dc37c5a3..5126c870ce5b 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -167,8 +167,8 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
bch2_trans_begin(trans);
ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
- lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
int ret2 = 0;
@@ -317,6 +317,17 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
}
static int bch2_copygc_thread(void *arg)
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c
index 2c3d46ac70c6..93246ad31541 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/namei.c
@@ -4,8 +4,8 @@
#include "acl.h"
#include "btree_update.h"
#include "dirent.h"
-#include "fs-common.h"
#include "inode.h"
+#include "namei.h"
#include "subvolume.h"
#include "xattr.h"
@@ -47,6 +47,10 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
+ /* Inherit casefold state from parent. */
+ if (S_ISDIR(mode))
+ new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded;
+
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
@@ -153,16 +157,14 @@ int bch2_create_trans(struct btree_trans *trans,
dir_u->bi_nlink++;
dir_u->bi_mtime = dir_u->bi_ctime = now;
- ret = bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
-
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- STR_HASH_must_create|BTREE_ITER_with_updates);
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ &dir_u->bi_size,
+ STR_HASH_must_create|BTREE_ITER_with_updates) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
@@ -225,7 +227,9 @@ int bch2_link_trans(struct btree_trans *trans,
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum.inum, &dir_offset,
+ name, inum.inum,
+ &dir_offset,
+ &dir_u->bi_size,
STR_HASH_must_create);
if (ret)
goto err;
@@ -417,8 +421,8 @@ int bch2_rename_trans(struct btree_trans *trans,
}
ret = bch2_dirent_rename(trans,
- src_dir, &src_hash,
- dst_dir, &dst_hash,
+ src_dir, &src_hash, &src_dir_u->bi_size,
+ dst_dir, &dst_hash, &dst_dir_u->bi_size,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
@@ -560,6 +564,8 @@ err:
return ret;
}
+/* inum_to_path */
+
static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
{
bch2_printbuf_make_room(out, n);
@@ -650,3 +656,179 @@ disconnected:
prt_str_reversed(path, "(disconnected)");
goto out;
}
+
+/* fsck */
+
+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
+ int ret = 0;
+
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ return __bch2_fsck_write_inode(trans, target);
+ }
+
+ if (bch2_inode_should_have_single_bp(target) &&
+ !fsck_err(trans, inode_wrong_backpointer,
+ "dirent points to inode that does not point back:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n "),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf)))
+ goto err;
+
+ struct bkey_s_c_dirent bp_dirent =
+ bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
+ 0, dirent);
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (!backpointer_exists) {
+ if (fsck_err(trans, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target->bi_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target);
+ }
+ } else {
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
+ /*
+ * XXX: verify connectivity of the other dirent
+ * up to the root before removing this one
+ *
+ * Additionally, bch2_lookup would need to cope with the
+ * dirent it found being removed - or should we remove
+ * the other one, even though the inode points to it?
+ */
+ if (in_fsck) {
+ if (fsck_err(trans, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf))
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
+ } else {
+ bch2_fs_inconsistent(c,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf);
+ }
+
+ goto out;
+ } else {
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(!target->bi_nlink,
+ trans, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int __bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ trans, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
+
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h
index 2b59210bb5e8..2e6f6364767f 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/namei.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_COMMON_H
-#define _BCACHEFS_FS_COMMON_H
+#ifndef _BCACHEFS_NAMEI_H
+#define _BCACHEFS_NAMEI_H
#include "dirent.h"
@@ -44,4 +44,29 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
-#endif /* _BCACHEFS_FS_COMMON_H */
+int __bch2_check_dirent_target(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c_dirent,
+ struct bch_inode_unpacked *, bool);
+
+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static inline int bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ if (likely(inode_points_to_dirent(target, d) &&
+ d.v->d_type == inode_d_type(target)))
+ return 0;
+
+ return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
+}
+
+#endif /* _BCACHEFS_NAMEI_H */
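
bch2_check_dirent_target() above is the usual fast-path/slow-path split: the inline wrapper returns straight away when the dirent and inode already agree, and only __bch2_check_dirent_target() does the repair work out of line. A tiny sketch of the pattern, with invented names and a trivial check standing in for the real consistency test:

#include <stdio.h>

/* Illustration of the split above: the inline wrapper handles the common
 * "already consistent" case, and only the out-of-line function repairs.
 * Names and the check itself are invented. */
static int check_value_slowpath(int have, int want)
{
	printf("repairing: have %d, want %d\n", have, want);
	return 0;
}

static inline int check_value(int have, int want)
{
	if (have == want)	/* likely(), in kernel terms */
		return 0;
	return check_value_slowpath(have, want);
}

int main(void)
{
	check_value(1, 1);	/* fast path: nothing to do */
	check_value(1, 2);	/* slow path: would repair */
	return 0;
}
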
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 6772faf385a5..81fd6b7977d3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -163,16 +163,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_SUBVOL] = "subvol",
};
-u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
-{
- BUG();
-}
-
-void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
-{
- BUG();
-}
-
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
@@ -223,6 +213,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
}
}
+/* dummy option, for options that aren't stored in the superblock */
+typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
+typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
+typedef u64 (*member_opt_get_fn)(const struct bch_member *);
+typedef void (*member_opt_set_fn)(struct bch_member *, u64);
+
+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
+
+#define type_compatible_or_null(_p, _type) \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
+
const struct bch_option bch2_opt_table[] = {
#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
@@ -239,15 +244,15 @@ const struct bch_option bch2_opt_table[] = {
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
[Opt_##_name] = { \
- .attr = { \
- .name = #_name, \
- .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- }, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = _sb_opt, \
- .set_sb = SET_##_sb_opt, \
+ .attr.name = #_name, \
+ .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
+ .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
+ .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
+ .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
_type \
},
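
The type_compatible_or_null() macro above is what lets one x() table column name either a struct bch_sb accessor or a struct bch_member accessor: __builtin_types_compatible_p() decides at compile time which signature the named accessor has, and __builtin_choose_expr() stores the function in the matching slot and NULL in the other. A self-contained sketch of the same trick, with made-up accessor and option names:

#include <stddef.h>
#include <stdio.h>

struct fs_sb  { unsigned long block_size; };
struct member { unsigned long durability; };

/* Two accessor families with different signatures (all names here invented): */
static unsigned long FS_BLOCK_SIZE(const struct fs_sb *sb)     { return sb->block_size; }
static unsigned long MEMBER_DURABILITY(const struct member *m) { return m->durability; }

typedef unsigned long (*fs_get_fn)(const struct fs_sb *);
typedef unsigned long (*member_get_fn)(const struct member *);
static const fs_get_fn     NO_FS_OPT     = NULL;
static const member_get_fn NO_MEMBER_OPT = NULL;

/* Keep _p only if its type matches _type, otherwise yield NULL - resolved
 * entirely at compile time, the same trick as type_compatible_or_null(). */
#define compatible_or_null(_p, _type) \
	__builtin_choose_expr( \
		__builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)

struct opt {
	fs_get_fn	get_fs;
	member_get_fn	get_member;
};

int main(void)
{
	/* Each table entry names one accessor; the macro routes it to the
	 * matching slot and leaves the other slot NULL: */
	struct opt block_size = {
		.get_fs     = compatible_or_null(FS_BLOCK_SIZE, *NO_FS_OPT),
		.get_member = compatible_or_null(FS_BLOCK_SIZE, *NO_MEMBER_OPT),
	};
	struct opt durability = {
		.get_fs     = compatible_or_null(MEMBER_DURABILITY, *NO_FS_OPT),
		.get_member = compatible_or_null(MEMBER_DURABILITY, *NO_MEMBER_OPT),
	};
	struct fs_sb  sb = { .block_size = 4096 };
	struct member m  = { .durability = 2 };

	printf("block_size: fs getter -> %lu, member getter %p\n",
	       block_size.get_fs(&sb), (void *) block_size.get_member);
	printf("durability: fs getter %p, member getter -> %lu\n",
	       (void *) durability.get_fs, durability.get_member(&m));
	return 0;
}
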
@@ -475,11 +480,18 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
-int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v)
{
+ lockdep_assert_held(&c->state_lock);
+
int ret = 0;
switch (id) {
+ case Opt_state:
+ if (ca)
+ return __bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
+ break;
+
case Opt_compression:
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
@@ -495,12 +507,8 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
int bch2_opts_check_may_set(struct bch_fs *c)
{
- unsigned i;
- int ret;
-
- for (i = 0; i < bch2_opts_nr; i++) {
- ret = bch2_opt_check_may_set(c, i,
- bch2_opt_get_by_id(&c->opts, i));
+ for (unsigned i = 0; i < bch2_opts_nr; i++) {
+ int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
@@ -619,12 +627,25 @@ out:
return ret;
}
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
{
const struct bch_option *opt = bch2_opt_table + id;
u64 v;
- v = opt->get_sb(sb);
+ if (dev_idx < 0) {
+ v = opt->get_sb(sb);
+ } else {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return 0;
+
+ struct bch_member m = bch2_sb_member_get(sb, dev_idx);
+ v = opt->get_member(&m);
+ }
+
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ --v;
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = 1ULL << v;
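
bch2_opt_from_sb() now undoes the transforms applied on the set side: OPT_SB_FIELD_ONE_BIAS options are stored with +1 (so, presumably, an on-disk zero can mean "unset") and OPT_SB_FIELD_ILOG2 options are stored as a log2. A tiny round-trip sketch of those two encodings, with illustrative values rather than the kernel helpers:

#include <stdio.h>

int main(void)
{
	/* OPT_SB_FIELD_ONE_BIAS: store v+1, read back v-1. */
	unsigned long long durability = 2;
	unsigned long long stored = durability + 1;
	printf("durability: stored %llu, decoded %llu\n", stored, stored - 1);

	/* OPT_SB_FIELD_ILOG2: store log2(v), read back 1 << v. */
	unsigned long long stored_log = 9;	/* e.g. a 512-unit value */
	printf("ilog2 field: stored %llu, decoded %llu\n",
	       stored_log, 1ULL << stored_log);
	return 0;
}
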
@@ -641,35 +662,19 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
*/
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
- unsigned id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
+ for (unsigned id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- if (opt->get_sb == BCH2_NO_SB_OPT)
- continue;
-
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ if (opt->get_sb)
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
}
return 0;
}
-struct bch_dev_sb_opt_set {
- void (*set_sb)(struct bch_member *, u64);
-};
-
-static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
-#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
- BCH_DEV_OPT_SETTERS()
-#undef x
-};
-
void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
const struct bch_option *opt, u64 v)
{
- enum bch_opt_id id = opt - bch2_opt_table;
-
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -679,24 +684,16 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
v++;
- if (opt->flags & OPT_FS) {
- if (opt->set_sb != SET_BCH2_NO_SB_OPT)
- opt->set_sb(sb, v);
- }
+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0)
+ opt->set_sb(sb, v);
- if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
if (WARN(!bch2_member_exists(sb, dev_idx),
"tried to set device option %s on nonexistent device %i",
opt->attr.name, dev_idx))
return;
- struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
-
- const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
- if (set->set_sb)
- set->set_sb(m, v);
- else
- pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v);
}
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9d397fc2a1f0..bb621804d45a 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -50,10 +50,6 @@ static inline const char *bch2_d_type_str(unsigned d_type)
* apply the options from that struct that are defined.
*/
-/* dummy option, for options that aren't stored in the superblock */
-u64 BCH2_NO_SB_OPT(const struct bch_sb *);
-void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
-
/* When can be set: */
enum opt_flags {
OPT_FS = BIT(0), /* Filesystem option */
@@ -132,19 +128,24 @@ enum fsck_err_opts {
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 8, \
+ BCH_SB_BLOCK_SIZE, 4 << 10, \
"size", NULL) \
x(btree_node_size, u32, \
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 512, \
+ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
+ x(write_error_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 300), \
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
+ NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
@@ -181,6 +182,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
+ x(checksum_err_retry_nr, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, 32), \
+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \
+ NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
@@ -197,7 +203,7 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
@@ -308,11 +314,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
- x(discard, u8, \
- OPT_FS|OPT_MOUNT|OPT_DEVICE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -493,27 +494,32 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, false, \
NULL, "Skip submit_bio() for data reads and writes, " \
"for performance testing purposes") \
- x(fs_size, u64, \
- OPT_DEVICE, \
+ x(state, u64, \
+ OPT_DEVICE|OPT_RUNTIME, \
+ OPT_STR(bch2_member_states), \
+ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
+ "state", "rw,ro,failed,spare") \
+ x(bucket_size, u32, \
+ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
- "size", "Size of filesystem on device") \
- x(bucket, u32, \
- OPT_DEVICE, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
+ BCH_MEMBER_BUCKET_SIZE, 0, \
"size", "Specifies the bucket size; must be greater than the btree node size")\
x(durability, u8, \
- OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
+ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH2_NO_SB_OPT, 1, \
+ BCH_MEMBER_DURABILITY, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
x(data_allowed, u8, \
OPT_DEVICE, \
OPT_BITFIELD(__bch2_data_types), \
- BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
"types", "Allowed data types for this device: journal, btree, and/or user")\
+ x(discard, u8, \
+ OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_MEMBER_DISCARD, true, \
+ NULL, "Enable discard/TRIM support") \
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -521,11 +527,6 @@ enum fsck_err_opts {
NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
-#define BCH_DEV_OPT_SETTERS() \
- x(discard, BCH_MEMBER_DISCARD) \
- x(durability, BCH_MEMBER_DURABILITY) \
- x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
-
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -582,8 +583,6 @@ struct printbuf;
struct bch_option {
struct attribute attr;
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
enum opt_type type;
enum opt_flags flags;
u64 min, max;
@@ -595,6 +594,12 @@ struct bch_option {
const char *hint;
const char *help;
+ u64 (*get_sb)(const struct bch_sb *);
+ void (*set_sb)(struct bch_sb *, u64);
+
+ u64 (*get_member)(const struct bch_member *);
+ void (*set_member)(struct bch_member *, u64);
+
};
extern const struct bch_option bch2_opt_table[];
@@ -603,7 +608,7 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
@@ -625,7 +630,7 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
-int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
new file mode 100644
index 000000000000..bafd1c91a802
--- /dev/null
+++ b/fs/bcachefs/progress.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "disk_accounting.h"
+#include "progress.h"
+
+void bch2_progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+void bch2_progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
new file mode 100644
index 000000000000..23fb1811f943
--- /dev/null
+++ b/fs/bcachefs/progress.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_PROGRESS_H
+#define _BCACHEFS_PROGRESS_H
+
+/*
+ * Lame progress indicators
+ *
+ * We don't like to use these because they print to the dmesg console, which is
+ * spammy - we much prefer to be wired up to a userspace program (e.g. via
+ * thread_with_file) and have it print the progress indicator.
+ *
+ * But some code is old and doesn't support that, or runs in a context where
+ * that's not yet practical (mount).
+ */
+
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
+void bch2_progress_update_iter(struct btree_trans *,
+ struct progress_indicator_state *,
+ struct btree_iter *,
+ const char *);
+
+#endif /* _BCACHEFS_PROGRESS_H */
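
The header comment explains why these indicators exist; mechanically they just count btree nodes and print at most once every ten seconds. A userspace sketch of that throttling pattern - the kernel code uses jiffies, and the names below are illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Userspace sketch of the progress_update_p() throttle: remember the next
 * time printing is allowed, and only print once the clock passes it.
 * The kernel code does the same thing with jiffies; names here are mine. */
struct progress {
	time_t next_print;
	unsigned long long seen, total;
};

static bool should_print(struct progress *p)
{
	time_t now = time(NULL);

	if (now < p->next_print)
		return false;
	p->next_print = now + 10;	/* same ten second cadence as above */
	return true;
}

int main(void)
{
	struct progress p = { .total = 1000000 };

	for (p.seen = 0; p.seen < p.total; p.seen++)
		if (should_print(&p))
			printf("done %llu/%llu nodes (%llu%%)\n",
			       p.seen, p.total, p.seen * 100 / p.total);
	return 0;
}
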
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index d0a1f5cd5c2b..29a569384146 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -26,9 +26,8 @@
/* bch_extent_rebalance: */
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL;
}
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
+}
+
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts,
struct bkey_s_c k,
@@ -97,11 +101,12 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts)
return 0;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@@ -341,7 +346,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;
if (!data_opts->rewrite_ptrs) {
/*
@@ -449,7 +454,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
{
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;
return data_opts->rewrite_ptrs != 0;
}
@@ -590,8 +595,19 @@ static int bch2_rebalance_thread(void *arg)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
+ printbuf_tabstop_push(out, 32);
+
struct bch_fs_rebalance *r = &c->rebalance;
+ /* print pending work */
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_rebalance_work, };
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+
+ prt_printf(out, "pending work:\t");
+ prt_human_readable_u64(out, v);
+ prt_printf(out, "\n\n");
+
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -600,15 +616,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
- prt_str(out, "io wait duration: ");
+ prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
- prt_str(out, "io wait remaining: ");
+ prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
- prt_str(out, "duration waited: ");
+ prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
@@ -621,6 +637,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break;
}
prt_newline(out);
+
+ rcu_read_lock();
+ struct task_struct *t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ rcu_read_unlock();
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
+
printbuf_indent_sub(out, 2);
}
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 71c786cdb192..266c5770c824 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -13,12 +13,12 @@
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
-#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
@@ -899,7 +899,7 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
- journal_seq += 8;
+ journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 418557960ed6..e89b9c783285 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -24,7 +24,7 @@
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
+ x(stripes_read, 1, 0) \
x(initialize_subvolumes, 2, 0) \
x(snapshots_read, 3, PASS_ALWAYS) \
x(check_allocations, 5, PASS_FSCK) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 441e648f28b5..68172c6eba21 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,12 +185,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
BUG_ON(missing_start < refd_start);
BUG_ON(missing_end > refd_end);
- if (fsck_err(trans, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- missing_start, missing_end)) {
+ struct bpos missing_pos = bkey_start_pos(p.k);
+ missing_pos.offset += missing_start - live_start;
+
+ prt_printf(&buf, "pointer to missing indirect extent in ");
+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
+ if (ret)
+ goto err;
+
+ prt_printf(&buf, "-%llu\n ", (missing_pos.offset + (missing_end - missing_start)) << 9);
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+
+ prt_printf(&buf, "\n missing reflink btree range %llu-%llu",
+ missing_start, missing_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
@@ -597,7 +606,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 6992e7469112..2b4b8445d418 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -5,7 +5,13 @@
/* BCH_SB_FIELD_counters */
-static const char * const bch2_counter_names[] = {
+static const u8 counters_to_stable_map[] = {
+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+};
+
+const char * const bch2_counter_names[] = {
#define x(t, n, ...) (#t),
BCH_PERSISTENT_COUNTERS()
#undef x
@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return 0;
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-};
+}
static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
-};
+}
static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- for (unsigned i = 0; i < nr; i++)
- prt_printf(out, "%s \t%llu\n",
- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
- le64_to_cpu(ctrs->d[i]));
-};
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ prt_printf(out, "%s \t%llu\n",
+ bch2_counter_names[i],
+ le64_to_cpu(ctrs->d[stable]));
+ }
+}
int bch2_sb_counters_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- u64 val = 0;
- for (i = 0; i < BCH_COUNTER_NR; i++)
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
c->counters_on_mount[i] = 0;
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
- val = le64_to_cpu(ctrs->d[i]);
- percpu_u64_set(&c->counters[i], val);
- c->counters_on_mount[i] = val;
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr) {
+ u64 v = le64_to_cpu(ctrs->d[stable]);
+ percpu_u64_set(&c->counters[i], v);
+ c->counters_on_mount[i] = v;
+ }
}
+
return 0;
-};
+}
int bch2_sb_counters_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
struct bch_sb_field_counters *ret;
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
if (nr < BCH_COUNTER_NR) {
ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
-
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
if (ret) {
ctrs = ret;
nr = bch2_sb_counter_nr_entries(ctrs);
}
}
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ }
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
return 0;
}
@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = {
.validate = bch2_sb_counters_validate,
.to_text = bch2_sb_counters_to_text,
};
+
+#ifndef NO_BCACHEFS_CHARDEV
+long bch2_ioctl_query_counters(struct bch_fs *c,
+ struct bch_ioctl_query_counters __user *user_arg)
+{
+ struct bch_ioctl_query_counters arg;
+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
+ if (ret)
+ return ret;
+
+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
+ arg.pad)
+ return -EINVAL;
+
+ arg.nr = min(arg.nr, BCH_COUNTER_NR);
+ ret = put_user(arg.nr, &user_arg->nr);
+ if (ret)
+ return ret;
+
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+
+ if (stable < arg.nr) {
+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
+ ? percpu_u64_get(&c->counters[i])
+ : c->counters_on_mount[i];
+
+ ret = put_user(v, &user_arg->d[stable]);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+#endif
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
index 81f8aec9fcb1..a4329ad8dd1b 100644
--- a/fs/bcachefs/sb-counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *);
void bch2_fs_counters_exit(struct bch_fs *);
int bch2_fs_counters_init(struct bch_fs *);
+extern const char * const bch2_counter_names[];
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+long bch2_ioctl_query_counters(struct bch_fs *,
+ struct bch_ioctl_query_counters __user *);
+
#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index fdcf598f08b1..fa27ec59a647 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -9,10 +9,24 @@ enum counters_flags {
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0, TYPE_SECTORS) \
+ x(io_read_inline, 80, TYPE_SECTORS) \
+ x(io_read_hole, 81, TYPE_SECTORS) \
+ x(io_read_promote, 30, TYPE_COUNTER) \
+ x(io_read_bounce, 31, TYPE_COUNTER) \
+ x(io_read_split, 33, TYPE_COUNTER) \
+ x(io_read_reuse_race, 34, TYPE_COUNTER) \
+ x(io_read_retry, 32, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
+ x(io_move_read, 35, TYPE_SECTORS) \
+ x(io_move_write, 36, TYPE_SECTORS) \
+ x(io_move_finish, 37, TYPE_SECTORS) \
+ x(io_move_fail, 38, TYPE_COUNTER) \
+ x(io_move_write_fail, 82, TYPE_COUNTER) \
+ x(io_move_start_fail, 39, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_discard_fast, 79, TYPE_COUNTER) \
x(bucket_alloc, 5, TYPE_COUNTER) \
x(bucket_alloc_fail, 6, TYPE_COUNTER) \
x(btree_cache_scan, 7, TYPE_COUNTER) \
@@ -38,16 +52,6 @@ enum counters_flags {
x(journal_reclaim_finish, 27, TYPE_COUNTER) \
x(journal_reclaim_start, 28, TYPE_COUNTER) \
x(journal_write, 29, TYPE_COUNTER) \
- x(read_promote, 30, TYPE_COUNTER) \
- x(read_bounce, 31, TYPE_COUNTER) \
- x(read_split, 33, TYPE_COUNTER) \
- x(read_retry, 32, TYPE_COUNTER) \
- x(read_reuse_race, 34, TYPE_COUNTER) \
- x(move_extent_read, 35, TYPE_SECTORS) \
- x(move_extent_write, 36, TYPE_SECTORS) \
- x(move_extent_finish, 37, TYPE_SECTORS) \
- x(move_extent_fail, 38, TYPE_COUNTER) \
- x(move_extent_start_fail, 39, TYPE_COUNTER) \
x(copygc, 40, TYPE_COUNTER) \
x(copygc_wait, 41, TYPE_COUNTER) \
x(gc_gens_end, 42, TYPE_COUNTER) \
@@ -95,6 +99,13 @@ enum bch_persistent_counters {
BCH_COUNTER_NR
};
+enum bch_persistent_counters_stable {
+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_STABLE_NR
+};
+
struct bch_sb_field_counters {
struct bch_sb_field field;
__le64 d[];
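
The new BCH_COUNTER_STABLE_* enum pins every counter to the on-disk slot given in the second x() column, so the in-memory enum can be reordered (as the io_read/io_move counters are here) without changing what lands in the superblock. A standalone sketch of that x-macro mapping, using made-up counters:

#include <stdio.h>

/* The x() list gives each counter a name and a fixed on-disk slot; the
 * in-memory enum follows list order and can be rearranged at will.
 * Counters and values here are made up. */
#define MY_COUNTERS()	\
	x(reads,  0)	\
	x(moves,  2)	\
	x(writes, 1)

enum my_counter {
#define x(n, id) MY_COUNTER_##n,
	MY_COUNTERS()
#undef x
	MY_COUNTER_NR
};

enum my_counter_stable {
#define x(n, id) MY_COUNTER_STABLE_##n = id,
	MY_COUNTERS()
#undef x
};

static const unsigned char to_stable[] = {
#define x(n, id) [MY_COUNTER_##n] = MY_COUNTER_STABLE_##n,
	MY_COUNTERS()
#undef x
};

static const char * const names[] = {
#define x(n, id) #n,
	MY_COUNTERS()
#undef x
};

int main(void)
{
	unsigned long long on_disk[3] = { 0 };

	/* Reads and writes always go through the stable slot: */
	for (unsigned i = 0; i < MY_COUNTER_NR; i++)
		on_disk[to_stable[i]] = 100 + i;

	for (unsigned i = 0; i < MY_COUNTER_NR; i++)
		printf("%-8s mem idx %u -> disk slot %u = %llu\n",
		       names[i], i, to_stable[i], on_disk[to_stable[i]]);
	return 0;
}
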
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 051214fdc735..acb5d845841e 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -90,7 +90,13 @@
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(cached_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(stripe_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index b86ec013d7d7..67455beb8358 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -179,6 +179,7 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
+ x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
@@ -314,7 +315,9 @@ enum bch_fsck_flags {
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(MAX, 304, 0)
+ x(dirent_cf_name_too_big, 304, 0) \
+ x(dirent_stray_data_after_cf_name, 305, 0) \
+ x(MAX, 307, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 762083b564ee..38261638a611 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -23,7 +23,19 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
return !percpu_ref_is_zero(&ca->io_ref);
}
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ bool ret = ca && bch2_dev_is_online(ca);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed;
@@ -271,6 +283,8 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
{
+ might_sleep();
+
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
if (ca && !percpu_ref_tryget(&ca->io_ref))
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index 2adf1221a440..3affec823b3f 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -79,6 +79,7 @@ struct bch_member {
#define BCH_MEMBER_V1_BYTES 56
+LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index c54091a28909..e7f197896db1 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -146,8 +146,9 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
goto out;
}
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
+ if (likely(ancestor >= IS_ANCESTOR_BITMAP))
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);
ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
@@ -389,7 +390,7 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
return 0;
}
-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
u32 id = snapshot_root;
u32 subvol = 0, s;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 00373cf32e7b..81180181d7c9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
return id;
}
+u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
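
The new guard in __bch2_snapshot_is_ancestor() matters because the IDs are unsigned: when ancestor is smaller than the bitmap window, ancestor - IS_ANCESTOR_BITMAP wraps around and the old loop condition held far longer than intended. A two-line demonstration of the wraparound; 128 is only a stand-in for the real IS_ANCESTOR_BITMAP value:

#include <stdio.h>

int main(void)
{
	/* 128 is only illustrative; the point is the unsigned wraparound
	 * when ancestor is smaller than the window. */
	unsigned ancestor = 5, window = 128;

	printf("unguarded: ancestor - window = %u\n", ancestor - window);
	printf("guarded:   %u\n", ancestor >= window ? ancestor - window : 0);
	return 0;
}
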
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index d78451c2a0c6..93e71119e5a4 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
for (unsigned i = 0; i < 1000; i++) {
unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
if (u64s > U8_MAX)
return -EINVAL;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 55a4ac7bf220..575ad1e03904 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -12,7 +12,6 @@
#include "super.h"
#include <linux/crc32c.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
static inline enum bch_str_hash_type
@@ -34,6 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u8 type;
+ struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@@ -47,17 +47,17 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
/* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
+#ifdef CONFIG_UNICODE
+ .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL,
+#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
- desc->tfm = c->sha256;
-
- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
+ sha256((const u8 *)&bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index a81a7b6c0989..572b06bfa0b8 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
struct bch2_metadata_version {
u16 version;
const char *name;
@@ -69,12 +66,14 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}
-bool bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
- bool ret = (c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
- version <= c->sb.version_incompat_allowed;
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+ version <= c->sb.version_incompat_allowed)
+ ? 0
+ : -BCH_ERR_may_not_use_incompat_feature;
- if (ret) {
+ if (!ret) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
@@ -366,39 +365,41 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
- enum bch_validate_flags flags, struct printbuf *out)
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+ enum bch_validate_flags flags, struct printbuf *out)
{
- struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
- u16 block_size;
int ret;
ret = bch2_sb_compatible(sb, out);
if (ret)
return ret;
- if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- prt_printf(out, "Filesystem has incompatible features");
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features;
}
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_printf(out, "Filesystem has incompatible version");
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features;
}
- block_size = le16_to_cpu(sb->block_size);
-
- if (block_size > PAGE_SECTORS) {
- prt_printf(out, "Block size too big (got %u, max %u)",
- block_size, PAGE_SECTORS);
- return -BCH_ERR_invalid_sb_block_size;
- }
-
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@@ -409,6 +410,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid;
}
+ if (!(flags & BCH_VALIDATE_write) &&
+ le64_to_cpu(sb->offset) != read_offset) {
+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
+ le64_to_cpu(sb->offset), read_offset);
+ return -BCH_ERR_invalid_sb_offset;
+ }
+
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)",
@@ -464,6 +472,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+ !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__
@@ -474,8 +489,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;
- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, opt_id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, opt_id, -1);
prt_printf(out, "Invalid option ");
ret = bch2_opt_validate(opt, v, out);
@@ -755,7 +770,7 @@ retry:
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
- sb->holder = kmalloc(1, GFP_KERNEL);
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;
@@ -881,7 +896,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb, 0, &err);
+ ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -918,16 +933,16 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
+
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "superblock %s error: %s",
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
+ }
closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
@@ -1038,7 +1053,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1166,7 +1181,7 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -1;
+ ret = -BCH_ERR_erofs_sb_err;
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@@ -1223,12 +1238,11 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
c->disk_sb.sb->version = cpu_to_le16(new_version);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
if (incompat) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
}
}
@@ -1459,8 +1473,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, id, -1);
prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index b4cff9ebdebb..78f708a6fbcd 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -21,13 +21,13 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-bool bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
{
return likely(version <= c->sb.version_incompat)
- ? true
+ ? 0
: bch2_set_version_incompat(c, version);
}
@@ -92,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0459c875e189..99f9a0aaa380 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -75,9 +75,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: crc64");
-MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
@@ -718,7 +715,7 @@ static int bch2_fs_online(struct bch_fs *c)
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir);
+ bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
if (ret) {
bch_err(c, "error creating sysfs objects");
return ret;
@@ -837,6 +834,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
+#ifdef CONFIG_UNICODE
+ /* Default encoding until we can potentially have more as an option. */
+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
+ if (IS_ERR(c->cf_encoding)) {
+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+ ret = -EINVAL;
+ goto err;
+ }
+#else
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
+ ret = -EINVAL;
+ goto err;
+ }
+#endif
+
pr_uuid(&name, c->sb.user_uuid.b);
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
@@ -1056,6 +1072,7 @@ int bch2_fs_start(struct bch_fs *c)
}
set_bit(BCH_FS_started, &c->flags);
+ wake_up(&c->ro_ref_wait);
if (c->opts.read_only) {
bch2_fs_read_only(c);
@@ -1280,8 +1297,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;
if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj,
- "dev-%u", ca->dev_idx);
+ ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
+ bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
if (ret)
return ret;
}
@@ -1412,6 +1429,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
+ /*
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
+ * attached to a filesystem, we will always close the block device
+ * before tearing down the filesystem object.
+ */
+ ca->disk_sb.holder->c = ca->fs;
+
ca->dev = ca->disk_sb.bdev->bd_dev;
percpu_ref_reinit(&ca->io_ref);
@@ -1966,15 +1990,12 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;
@@ -1998,6 +2019,102 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
+{
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
+ struct bch_fs *c = holder->c;
+
+ if (c && !bch2_ro_ref_tryget(c))
+ c = NULL;
+
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ if (c)
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+ return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+ for_each_member_device(c, ca)
+ if (ca->disk_sb.bdev == bdev)
+ return ca;
+ return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ }
+
+ down_write(&c->state_lock);
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto unlock;
+
+ if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+ __bch2_dev_offline(c, ca);
+ } else {
+ if (sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ evict_inodes(sb);
+ }
+
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only(c);
+ }
+
+ bch2_dev_put(ca);
+unlock:
+ if (sb)
+ up_read(&sb->s_umount);
+ up_write(&c->state_lock);
+ bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ }
+
+ bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+ .mark_dead = bch2_fs_bdev_mark_dead,
+ .sync = bch2_fs_bdev_sync,
+};
+
/* Filesystem open: */
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
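
The blk_holder_ops table added above only takes effect if it is registered when the member block device is opened, with the bch_sb_handle_holder as the holder cookie so that bdev_get_fs() can later recover the filesystem from bdev->bd_holder. A minimal sketch of that wiring, assuming the stock bdev_file_open_by_path()/file_bdev() helpers from <linux/blkdev.h>; the function name is hypothetical and the real open path (in super-io.c) is not part of this hunk:

    static int bch2_sb_open_bdev_sketch(struct bch_sb_handle *sb,
                                        const char *path, blk_mode_t mode)
    {
            sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
            if (!sb->holder)
                    return -ENOMEM;

            /* holder->c stays NULL until __bch2_dev_attach_bdev() stashes the fs */
            sb->s_bdev_file = bdev_file_open_by_path(path, mode, sb->holder,
                                                     &bch2_sb_handle_bdev_ops);
            if (IS_ERR(sb->s_bdev_file)) {
                    kfree(sb->holder);
                    return PTR_ERR(sb->s_bdev_file);
            }
            sb->bdev = file_bdev(sb->s_bdev_file);
            return 0;
    }

Once holder->c is set, a surprise device removal lands in bch2_fs_bdev_mark_dead(), which either offlines the member (if the remaining devices can serve the filesystem degraded) or flushes the journal and forces the filesystem into emergency read-only.
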
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 04f8287eff5c..23533bce5709 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -42,4 +42,6 @@ void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
+
#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 368a63d938cf..3a899f799d1d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
+struct bch_fs;
+
+struct bch_sb_handle_holder {
+ struct bch_fs *c;
+};
+
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
- void *holder;
+ struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index a7eb1f511484..251ba8224c1f 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -146,15 +146,14 @@ write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
write_attribute(trigger_freelist_wakeup);
+write_attribute(trigger_btree_updates);
read_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
read_attribute(flags);
-read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-rw_attribute(durability);
read_attribute(io_done);
read_attribute(io_errors);
write_attribute(io_errors_reset);
@@ -173,10 +172,8 @@ read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
-read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
-read_attribute(write_points);
read_attribute(nocow_lock_table);
#ifdef BCH_WRITE_REF_DEBUG
@@ -209,8 +206,6 @@ read_attribute(usage_base);
BCH_PERSISTENT_COUNTERS()
#undef x
-rw_attribute(discard);
-read_attribute(state);
rw_attribute(label);
read_attribute(copy_gc_wait);
@@ -355,18 +350,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);
- if (attr == &sysfs_stripes_heap)
- bch2_stripes_heap_to_text(out, c);
-
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);
if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);
- if (attr == &sysfs_write_points)
- bch2_write_points_to_text(out, c);
-
if (attr == &sysfs_compression_stats)
bch2_compression_stats_to_text(out, c);
@@ -415,6 +404,9 @@ STORE(bch2_fs)
/* Debugging: */
+ if (attr == &sysfs_trigger_btree_updates)
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
return -EROFS;
@@ -566,10 +558,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
- &sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
- &sysfs_write_points,
#ifdef BCH_WRITE_REF_DEBUG
&sysfs_write_refs,
#endif
@@ -585,6 +575,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
&sysfs_trigger_freelist_wakeup,
+ &sysfs_trigger_btree_updates,
&sysfs_gc_gens_pos,
@@ -604,26 +595,34 @@ struct attribute *bch2_fs_internal_files[] = {
/* options */
-SHOW(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_show(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ struct printbuf *out)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int id = opt - bch2_opt_table;
- u64 v = bch2_opt_get_by_id(&c->opts, id);
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ if (opt->flags & OPT_FS) {
+ v = bch2_opt_get_by_id(&c->opts, id);
+ } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
+ v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
+ } else {
+ return -EINVAL;
+ }
bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
prt_char(out, '\n');
-
return 0;
}
-STORE(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_store(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ const char *buf, size_t size)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int ret, id = opt - bch2_opt_table;
- char *tmp;
- u64 v;
+ const struct bch_option *opt = bch2_opt_table + id;
+ int ret = 0;
/*
* We don't need to take c->writes for correctness, but it eliminates an
@@ -632,27 +631,28 @@ STORE(bch2_fs_opts_dir)
if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
return -EROFS;
- tmp = kstrdup(buf, GFP_KERNEL);
+ down_write(&c->state_lock);
+
+ char *tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp) {
ret = -ENOMEM;
goto err;
}
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
+ u64 v;
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
+ bch2_opt_check_may_set(c, ca, id, v);
kfree(tmp);
if (ret < 0)
goto err;
- ret = bch2_opt_check_may_set(c, id, v);
- if (ret < 0)
- goto err;
-
- bch2_opt_set_sb(c, NULL, opt, v);
+ bch2_opt_set_sb(c, ca, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if (v &&
(id == Opt_background_target ||
+ (id == Opt_foreground_target && !c->opts.background_target) ||
id == Opt_background_compression ||
(id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
@@ -664,27 +664,56 @@ STORE(bch2_fs_opts_dir)
c->copygc_thread)
wake_up_process(c->copygc_thread);
+ if (id == Opt_discard && !ca) {
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca)
+ opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
ret = size;
err:
+ up_write(&c->state_lock);
bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return ret;
}
+
+SHOW(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_show(c, NULL, id, out);
+}
+
+STORE(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_store(c, NULL, id, buf, size);
+}
SYSFS_OPS(bch2_fs_opts_dir);
struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-int bch2_opts_create_sysfs_files(struct kobject *kobj)
+int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
{
- const struct bch_option *i;
- int ret;
-
- for (i = bch2_opt_table;
+ for (const struct bch_option *i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (!(i->flags & OPT_FS))
+ if (i->flags & OPT_HIDDEN)
+ continue;
+ if (!(i->flags & type))
continue;
- ret = sysfs_create_file(kobj, &i->attr);
+ int ret = sysfs_create_file(kobj, &i->attr);
if (ret)
return ret;
}
@@ -755,11 +784,8 @@ SHOW(bch2_dev)
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
- sysfs_print(bucket_size, bucket_bytes(ca));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
- sysfs_print(durability, ca->mi.durability);
- sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
if (ca->mi.group)
@@ -772,11 +798,6 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}
- if (attr == &sysfs_state) {
- prt_string_option(out, bch2_member_states, ca->mi.state);
- prt_char(out, '\n');
- }
-
if (attr == &sysfs_io_done)
dev_io_done_to_text(out, ca);
@@ -802,6 +823,10 @@ SHOW(bch2_dev)
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, ca);
+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_show(c, ca, opt_id, out);
+
return 0;
}
@@ -810,18 +835,6 @@ STORE(bch2_dev)
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
- }
-
- if (attr == &sysfs_durability) {
- u64 v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -839,20 +852,20 @@ STORE(bch2_dev)
if (attr == &sysfs_io_errors_reset)
bch2_dev_errors_reset(ca);
+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_store(c, ca, opt_id, buf, size);
+
return size;
}
SYSFS_OPS(bch2_dev);
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
- &sysfs_bucket_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
- &sysfs_durability,
/* settings: */
- &sysfs_discard,
- &sysfs_state,
&sysfs_label,
&sysfs_has_data,
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
index 222cd5062702..303e0433c702 100644
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
@@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
extern const struct sysfs_ops bch2_dev_sysfs_ops;
-int bch2_opts_create_sysfs_files(struct kobject *);
+int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
#else
@@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
static const struct sysfs_ops bch2_dev_sysfs_ops;
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
+{ return 0; }
#endif /* NO_BCACHEFS_SYSFS */
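
With this change both the filesystem opts_dir and each dev-%u kobject expose files generated directly from bch2_opt_table, filtered by OPT_FS or OPT_DEVICE, and the show/store paths map the attribute back to an option id by name via bch2_opt_lookup(). That helper already exists in the options code; purely as an illustration of what a name-based lookup amounts to:

    /* Illustration only, not the real bch2_opt_lookup() body. */
    static int opt_lookup_sketch(const char *name)
    {
            for (unsigned i = 0; i < bch2_opts_nr; i++)
                    if (!strcmp(bch2_opt_table[i].attr.name, name))
                            return i;
            return -1;
    }

The practical effect is that the hand-rolled discard and durability files under /sys/fs/bcachefs/<uuid>/dev-<n>/ are replaced by option files created by bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE), so writes to them go through the same parse, check and set-superblock path as the filesystem-wide options.
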
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c1b51009edf6..519d00d62ae7 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -295,12 +295,12 @@ TRACE_EVENT(write_super,
/* io.c: */
-DEFINE_EVENT(bio, read_promote,
+DEFINE_EVENT(bio, io_read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-TRACE_EVENT(read_nopromote,
+TRACE_EVENT(io_read_nopromote,
TP_PROTO(struct bch_fs *c, int ret),
TP_ARGS(c, ret),
@@ -319,26 +319,50 @@ TRACE_EVENT(read_nopromote,
__entry->ret)
);
-DEFINE_EVENT(bio, read_bounce,
+DEFINE_EVENT(bio, io_read_bounce,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, io_read_split,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_retry,
+DEFINE_EVENT(bio, io_read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_reuse_race,
+DEFINE_EVENT(bio, io_read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
+/* ec.c */
+
+TRACE_EVENT(stripe_create,
+ TP_PROTO(struct bch_fs *c, u64 idx, int ret),
+ TP_ARGS(c, idx, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, idx )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->idx = idx;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d idx %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->idx,
+ __entry->ret)
+);
+
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
@@ -797,53 +821,37 @@ TRACE_EVENT(bucket_invalidate,
/* Moving IO */
-TRACE_EVENT(bucket_evacuate,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket),
- TP_ARGS(c, bucket),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = bucket->inode;
- __entry->bucket = bucket->offset;
- ),
-
- TP_printk("%d:%d %u:%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket)
+DEFINE_EVENT(fs_str, io_move,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent,
+DEFINE_EVENT(fs_str, io_move_read,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_read,
+DEFINE_EVENT(fs_str, io_move_write,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_write,
+DEFINE_EVENT(fs_str, io_move_finish,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_finish,
+DEFINE_EVENT(fs_str, io_move_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_fail,
+DEFINE_EVENT(fs_str, io_move_write_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_start_fail,
+DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -881,37 +889,6 @@ TRACE_EVENT(move_data,
__entry->sectors_raced)
);
-TRACE_EVENT(evacuate_bucket,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket,
- unsigned sectors, unsigned bucket_size,
- int ret),
- TP_ARGS(c, bucket, sectors, bucket_size, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, member )
- __field(u64, bucket )
- __field(u32, sectors )
- __field(u32, bucket_size )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->member = bucket->inode;
- __entry->bucket = bucket->offset;
- __entry->sectors = sectors;
- __entry->bucket_size = bucket_size;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->member, __entry->bucket,
- __entry->sectors, __entry->bucket_size,
- __entry->ret)
-);
-
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
u64 buckets,
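
The renames above only change the event names; DEFINE_EVENT() still generates a trace_<name>() call and a trace_<name>_enabled() predicate for each event, so callers simply switch from trace_read_promote() to trace_io_read_promote() and so on. A hedged sketch of call sites after the rename (rbio, c and buf are placeholders, not taken from this hunk):

    /* bio-class event: takes just the bio being promoted */
    if (trace_io_read_promote_enabled())
            trace_io_read_promote(&rbio->bio);

    /* fs_str-class event: filesystem plus a preformatted string */
    trace_io_move_read(c, buf.buf);
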
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index da2cd11b3025..553de8d8e3e5 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -473,10 +473,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+ eytzinger0_for_each(j, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
- u64 q = max(quantiles->entries[i].m, last_q);
+ u64 q = max(quantiles->entries[j].m, last_q);
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
@@ -704,12 +704,33 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned u64s = bv.bv_len / sizeof(u64);
+
+ if (offset < u64s) {
+ u64 *segment = bvec_kmap_local(&bv);
+ segment[offset] = get_random_u64();
+ kunmap_local(segment);
+ return;
+ }
+ offset -= u64s;
+ }
+}
+#endif
+
#if 0
void eytzinger1_test(void)
{
- unsigned inorder, eytz, size;
+ unsigned inorder, size;
- pr_info("1 based eytzinger test:");
+ pr_info("1 based eytzinger test:\n");
for (size = 2;
size < 65536;
@@ -717,13 +738,7 @@ void eytzinger1_test(void)
unsigned extra = eytzinger1_extra(size);
if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
-
- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+ pr_info("tree size %u\n", size);
inorder = 1;
eytzinger1_for_each(eytz, size) {
@@ -734,15 +749,16 @@ void eytzinger1_test(void)
inorder++;
}
+ BUG_ON(inorder - 1 != size);
}
}
void eytzinger0_test(void)
{
- unsigned inorder, eytz, size;
+ unsigned inorder, size;
- pr_info("0 based eytzinger test:");
+ pr_info("0 based eytzinger test:\n");
for (size = 1;
size < 65536;
@@ -750,13 +766,7 @@ void eytzinger0_test(void)
unsigned extra = eytzinger0_extra(size);
if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
-
- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+ pr_info("tree size %u\n", size);
inorder = 0;
eytzinger0_for_each(eytz, size) {
@@ -767,54 +777,191 @@ void eytzinger0_test(void)
inorder++;
}
+ BUG_ON(inorder != size);
+
+ inorder = size - 1;
+ eytzinger0_for_each_prev(eytz, size) {
+ BUG_ON(eytz != eytzinger0_first(size) &&
+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
+
+ inorder--;
+ }
+ BUG_ON(inorder != -1);
}
}
-static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+static inline int cmp_u16(const void *_l, const void *_r)
{
const u16 *l = _l, *r = _r;
- return (*l > *r) - (*r - *l);
+ return (*l > *r) - (*r > *l);
}
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
{
- int i, c1 = -1, c2 = -1;
- ssize_t r;
+ int r, s;
+ bool bad;
r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
- if (r >= 0)
- c1 = test_array[r];
-
- for (i = 0; i < nr; i++)
- if (test_array[i] <= search && test_array[i] > c2)
- c2 = test_array[i];
-
- if (c1 != c2) {
- eytzinger0_for_each(i, nr)
- pr_info("[%3u] = %12u", i, test_array[i]);
- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
- i, r, c1, c2);
+ if (r >= 0) {
+ if (test_array[r] > search) {
+ bad = true;
+ } else {
+ s = eytzinger0_next(r, nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+ } else {
+ s = eytzinger0_last(nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each_prev(j, nr) {
+ if (test_array[j] <= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_le(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
}
}
+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_gt(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] <= search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] > search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_gt(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_ge(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] < search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] >= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_ge(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
+{
+ unsigned r;
+ int s;
+ bool bad;
+
+ r = eytzinger0_find(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+
+ if (r < nr) {
+ bad = test_array[r] != search;
+ } else {
+ s = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ bad = s >= 0 && test_array[s] == search;
+ }
+
+ if (bad) {
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find(%12u) = %3i is incorrect\n",
+ search, r);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ eytzinger0_find_test_le(test_array, nr, search);
+ eytzinger0_find_test_gt(test_array, nr, search);
+ eytzinger0_find_test_ge(test_array, nr, search);
+ eytzinger0_find_test_eq(test_array, nr, search);
+}
+
void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
for (nr = 1; nr < allocated; nr++) {
- pr_info("testing %u elems", nr);
+ u16 prev = 0;
+
+ pr_info("testing %u elems\n", nr);
get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
/* verify array is sorted correctly: */
- eytzinger0_for_each(i, nr)
- BUG_ON(i != eytzinger0_last(nr) &&
- test_array[i] > test_array[eytzinger0_next(i, nr)]);
+ eytzinger0_for_each(j, nr) {
+ BUG_ON(test_array[j] < prev);
+ prev = test_array[j];
+ }
for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);
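
Each new find-test helper follows the same pattern: run the eytzinger search, and only on a suspected mismatch recompute the answer by brute force and dump the array. For eytzinger0_find_le() the property checked is "an element <= the search value whose inorder successor, if any, is > the search value, or -1 when nothing qualifies". The test code the patch removes did the brute-force step by value; a sketch of that value-based oracle, which works regardless of the eytzinger storage order:

    /* Sketch: linear, order-independent oracle for find_le, by value. */
    static int find_le_value_linear(const u16 *array, unsigned nr, u16 search)
    {
            int best = -1;          /* -1 == no element <= search */

            for (unsigned i = 0; i < nr; i++)
                    if (array[i] <= search && (int) array[i] > best)
                            best = array[i];
            return best;
    }
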
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index f4a4783219d9..7d921fc920a0 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -406,6 +406,18 @@ u64 bch2_get_random_u64_below(u64);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *);
+
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+{
+ if (ratio && !get_random_u32_below(ratio))
+ bch2_corrupt_bio(bio);
+}
+#else
+#define bch2_maybe_corrupt_bio(...) do {} while (0)
+#endif
+
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{
@@ -419,7 +431,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src,
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("rep ; movsq"
@@ -496,7 +508,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
u64 *dst = (u64 *) _dst + u64s - 1;
u64 *src = (u64 *) _src + u64s - 1;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("std ;\n"
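
bch2_maybe_corrupt_bio() is meant to be dropped onto IO submission paths under CONFIG_BCACHEFS_DEBUG: with probability 1/ratio it overwrites one randomly chosen u64 of the bio's payload via bch2_corrupt_bio(), and on non-debug builds the macro variant compiles away entirely. A hedged usage sketch (bio and ratio are placeholders; the option feeding the ratio is not part of this hunk):

    /* Hypothetical call site: corrupt roughly 1 in `ratio` bios before submit;
     * a ratio of 0 disables injection. */
    bch2_maybe_corrupt_bio(bio, ratio);
    submit_bio(bio);
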
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index aed7c6984173..f9667b944c0d 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -523,7 +523,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err_class_exit;
- ret = bch2_opt_check_may_set(c, opt_id, v);
+ ret = bch2_opt_check_may_set(c, NULL, opt_id, v);
if (ret < 0)
goto err_class_exit;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8054f44d39cf..584fa89bc877 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz,
}
#define NOTE_DATA_SZ SZ_1K
-#define GNU_PROPERTY_TYPE_0_NAME "GNU"
-#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
+#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0))
static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
struct arch_elf_state *arch)
@@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
note.nhdr.n_namesz != NOTE_NAME_SZ ||
strncmp(note.data + sizeof(note.nhdr),
- GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
+ NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr)))
return -ENOEXEC;
off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
@@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
const kernel_siginfo_t *siginfo)
{
copy_siginfo_to_external(csigdata, siginfo);
- fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+ fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
}
/*
@@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
}
size = name_curpos - (char *)data;
- fill_note(note, "CORE", NT_FILE, size, data);
+ fill_note(note, NN_FILE, NT_FILE, size, data);
return 0;
}
@@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
@@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
note_type, ret, data);
info->size += notesize(&t->notes[note_iter]);
@@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
&(t->prstatus));
info->size += notesize(&t->notes[0]);
@@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
}
t->prstatus.pr_fpvalid = 1;
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
return 1;
@@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
- fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
#ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c13ee8180b17..9133f3827f90 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
/* deal with each load segment separately */
phdr = params->phdrs;
for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
- unsigned long maddr, disp, excess, excess1;
+ unsigned long maddr, disp, excess;
int prot = 0, flags;
if (phdr->p_type != PT_LOAD)
@@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
* extant in the file
*/
excess = phdr->p_memsz - phdr->p_filesz;
- excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
#ifdef CONFIG_MMU
+ unsigned long excess1
+ = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
if (excess > excess1) {
unsigned long xaddr = maddr + phdr->p_filesz + excess1;
unsigned long xmaddr;
@@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
&t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
}
if (t->prstatus.pr_fpvalid) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+ fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu),
&t->fpu);
t->num_notes++;
*sz += notesize(&t->notes[1]);
@@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
*/
fill_psinfo(psinfo, current->group_leader, current->mm);
- fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo);
thread_status_size += notesize(&psinfo_note);
auxv = (elf_addr_t *) current->mm->saved_auxv;
@@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
offset = sizeof(*elf); /* ELF header */
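
The NN_* identifiers used above are note-name string macros; their definitions are not part of this diff, but the substitutions imply they expand to the same literals they replace ("CORE" for the process-status family of notes, "GNU" for the property note), which is what keeps NOTE_NAME_SZ and the emitted core-dump note names unchanged. Assumed shape, for reference only:

    /* Assumed definitions (not shown in this diff): */
    #define NN_GNU_PROPERTY_TYPE_0  "GNU"
    #define NN_PRSTATUS             "CORE"
    #define NN_PRFPREG              "CORE"
    #define NN_PRPSINFO             "CORE"
    #define NN_AUXV                 "CORE"
    #define NN_SIGINFO              "CORE"
    #define NN_FILE                 "CORE"
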
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 7a7e0ef69973..15ea6348800b 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>
+#include "extent_io.h"
struct extent_buffer;
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 48b9ddae4a46..0458cd51ed48 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+#include <linux/types.h>
+
struct posix_acl;
struct inode;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index a4c51600a408..f3bffe08b290 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
- int need_change = 0;
+ bool need_change = false;
if (wq->thresh == NO_THRESHOLD)
return;
@@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
new_current_active--;
new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
if (new_current_active != wq->current_active) {
- need_change = 1;
+ need_change = true;
wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
- if (need_change) {
+ if (need_change)
workqueue_set_max_active(wq->normal_wq, wq->current_active);
- }
}
static void run_ordered_work(struct btrfs_workqueue *wq,
@@ -296,7 +295,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct btrfs_workqueue *wq = work->wq;
- int need_order = 0;
+ bool need_order = false;
/*
* We should not touch things inside work in the following cases:
@@ -307,7 +306,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
* So we save the needed things here.
*/
if (work->ordered_func)
- need_order = 1;
+ need_order = true;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3d3923cfc357..5936cff80ff3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
ASSERT(ctx->roots == NULL);
key.objectid = ctx->bytenr;
- key.offset = (u64)-1;
if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
path = btrfs_alloc_path();
if (!path)
@@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_extent_item *ei;
struct btrfs_key key;
+ key.objectid = logical;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index bc2555c44a12..8c2eee1f1878 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
return bbio;
}
-/* Free a bio that was never submitted to the underlying device. */
-static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio))
- btrfs_put_ordered_extent(bbio->ordered);
- bio_put(&bbio->bio);
-}
-
-static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio)) {
- struct btrfs_ordered_extent *ordered = bbio->ordered;
-
- bbio->end_io(bbio);
- btrfs_put_ordered_extent(ordered);
- } else {
- bbio->end_io(bbio);
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- btrfs_cleanup_bio(bbio);
+ /* Free bio that was never submitted to the underlying device. */
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+
bbio = orig_bbio;
}
@@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
/* Load split bio's error which might be set above. */
if (status == BLK_STS_OK)
bbio->bio.bi_status = READ_ONCE(bbio->status);
- __btrfs_bio_end_io(bbio);
+
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
}
}
@@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
+ btrfs_bio_end_io(async->bbio, bio->bi_status);
return;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0a8f7d92acc..a8129f1ce78c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new,
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group *block_group)
+static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct rb_node *exist;
int ret = 0;
ASSERT(block_group->length != 0);
- write_lock(&info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
exist = rb_find_add_cached(&block_group->cache_node,
- &info->block_group_cache_tree, btrfs_bg_start_cmp);
+ &fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
if (exist)
ret = -EEXIST;
- write_unlock(&info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
return ret;
}
@@ -584,7 +584,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
struct btrfs_root *extent_root;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret = 0;
@@ -626,7 +626,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
- btrfs_free_path(path);
return ret;
}
@@ -738,8 +737,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
path->reada = READA_FORWARD;
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -785,8 +784,8 @@ next:
if (key.objectid < last) {
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
btrfs_release_path(path);
goto next;
}
@@ -1457,6 +1456,32 @@ out:
}
/*
+ * Link the block_group to a list via bg_list.
+ *
+ * @bg: The block_group to link to the list.
+ * @list: The list to link it to.
+ *
+ * Use this rather than list_add_tail() directly so that locking and
+ * refcounting are handled consistently.
+ *
+ * Returns: true if the bg was linked with a refcount bump and false otherwise.
+ */
+static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ bool added = false;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, list);
+ added = true;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return added;
+}
+
+/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
@@ -1571,8 +1596,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* drop under the "next" label for the
* fs_info->unused_bgs list.
*/
- btrfs_get_block_group(block_group);
- list_add_tail(&block_group->bg_list, &retry_list);
+ btrfs_link_bg_list(block_group, &retry_list);
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1823,7 +1847,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
- u64 reclaimed;
+ u64 used;
+ u64 reserved;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1887,6 +1912,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the block group is read only it's
+ * to read only. As soon as the block group is read only, its
+ * zone_unusable value gets moved to the block group's read-only
+ * cache it before unlocking the block group, to prevent races
+ * (reports from KCSAN and such tools) with tasks updating it.
+ */
+ zone_unusable = bg->zone_unusable;
+
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
@@ -1903,31 +1939,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * chunk below, relocation first flushes delalloc, waits for
+ * chunk below, relocation first flushes dellaloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable",
bg->start,
- div64_u64(bg->used * 100, bg->length),
+ div64_u64(used * 100, bg->length),
+ div64_u64(reserved * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
- reclaimed = bg->used;
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
- reclaimed = 0;
+ used = 0;
+ reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -1936,24 +1988,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&space_info->lock);
space_info->reclaim_count++;
- space_info->reclaim_bytes += reclaimed;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
spin_unlock(&space_info->lock);
next:
- if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
- /* Refcount held by the reclaim_bgs list after splice. */
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * This block group might be added to the unused list
- * during the above process. Move it back to the
- * reclaim list otherwise.
- */
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
- }
+ if (ret && !READ_ONCE(space_info->periodic_reclaim))
+ btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1993,13 +2034,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
+ if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
trace_btrfs_add_reclaim_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
@@ -2410,7 +2446,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
- ret = btrfs_add_block_group_cache(info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
@@ -2459,7 +2495,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = map->chunk_len;
bg->flags = map->type;
- ret = btrfs_add_block_group_cache(fs_info, bg);
+ ret = btrfs_add_block_group_cache(bg);
/*
* We may have some valid block group cache added already, in
* that case we skip to the next one.
@@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
return fill_dummy_bgs(info);
key.objectid = 0;
- key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
key.offset = start;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
@@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
/*
* If the block group is still unused, add it to the list of
@@ -2888,7 +2926,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
ASSERT(cache->space_info);
- ret = btrfs_add_block_group_cache(fs_info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
@@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
}
#endif
- list_add_tail(&cache->bg_list, &trans->new_bgs);
+ btrfs_link_bg_list(cache, &trans->new_bgs);
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
@@ -3306,7 +3344,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
@@ -3323,7 +3361,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
}
- btrfs_free_path(path);
return 0;
}
@@ -3346,7 +3383,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int loops = 0;
@@ -3501,7 +3538,6 @@ out:
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
- btrfs_free_path(path);
return ret;
}
@@ -3512,7 +3548,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct list_head *io = &cur_trans->io_bgs;
path = btrfs_alloc_path();
@@ -3624,7 +3660,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
btrfs_put_block_group(cache);
}
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b2fa33911c28..4e2952cf5766 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -145,6 +145,7 @@ struct btrfs_inode {
* different from prop_compress and takes precedence if set.
*/
u8 defrag_compress;
+ s8 defrag_compress_level;
/*
* Lock for counters and all fields used to determine if the inode is in
@@ -516,6 +517,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
+static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ mapping_clear_stable_writes(inode->vfs_inode.i_mapping);
+ else
+ mapping_set_stable_writes(inode->vfs_inode.i_mapping);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
@@ -524,7 +533,7 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait);
@@ -584,9 +593,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path);
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
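
btrfs_update_inode_mapping_flags() encodes the rule that only checksummed data needs stable page writes: with BTRFS_INODE_NODATASUM set no checksum is computed over the page contents at writeback time, so the mapping's stable-writes flag can be cleared; otherwise it must be set so pages are not modified while checksumming and IO are in flight. A hedged sketch of where it would be called (the call site is assumed, not shown in this diff):

    /* Hypothetical call site: run whenever BTRFS_INODE_NODATASUM may have
     * changed, e.g. after the on-disk inode flags are (re)loaded or toggled. */
    btrfs_update_inode_mapping_flags(inode);
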
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0c4d486c3048..e7f8ee5d48a4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, unsigned int level)
+static struct list_head *alloc_workspace(int type, int level)
{
switch (type) {
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
@@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, unsigned int level)
+struct list_head *btrfs_get_workspace(int type, int level)
{
struct workspace_manager *wsm;
struct list_head *workspace;
@@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws)
* Adjust @level according to the limits of the compression algorithm or
* fallback to default
*/
-static unsigned int btrfs_compress_set_level(int type, unsigned level)
+static int btrfs_compress_set_level(unsigned int type, int level)
{
const struct btrfs_compress_op *ops = btrfs_compress_op[type];
if (level == 0)
level = ops->default_level;
else
- level = min(level, ops->max_level);
+ level = min(max(level, ops->min_level), ops->max_level);
return level;
}
+/*
+ * Check whether @level is within the valid range for the given type.
+ */
+bool btrfs_compress_level_valid(unsigned int type, int level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ return ops->min_level <= level && level <= ops->max_level;
+}
+
/* Wrapper around find_get_page(), with extra error message. */
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret)
@@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
- int type = btrfs_compress_type(type_level);
- int level = btrfs_compress_level(type_level);
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1590,18 +1598,19 @@ out:
/*
* Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level
+ * level; an unrecognized string sets the default level. Negative level
+ * numbers are allowed.
*/
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str)
{
- unsigned int level = 0;
+ int level = 0;
int ret;
if (!type)
return 0;
if (str[0] == ':') {
- ret = kstrtouint(str + 1, 10, &level);
+ ret = kstrtoint(str + 1, 10, &level);
if (ret)
level = 0;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 954034086d0d..df198623cc08 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -72,16 +72,6 @@ struct compressed_bio {
struct btrfs_bio bbio;
};
-static inline unsigned int btrfs_compress_type(unsigned int type_level)
-{
- return (type_level & 0xF);
-}
-
-static inline unsigned int btrfs_compress_level(unsigned int type_level)
-{
- return ((type_level & 0xF0) >> 4);
-}
-
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
{
@@ -93,7 +83,8 @@ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+bool btrfs_compress_level_valid(unsigned int type, int level);
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -107,7 +98,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str);
struct folio *btrfs_alloc_compr_folio(void);
void btrfs_free_compr_folio(struct folio *folio);
@@ -131,14 +122,15 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, unsigned int level);
+struct list_head *btrfs_get_workspace(int type, int level);
void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
struct workspace_manager *workspace_manager;
/* Maximum level supported by the compression algorithm */
- unsigned int max_level;
- unsigned int default_level;
+ int min_level;
+ int max_level;
+ int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
@@ -187,9 +179,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+struct list_head *zstd_alloc_workspace(int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
+struct list_head *zstd_get_workspace(int level);
void zstd_put_workspace(struct list_head *ws);
#endif
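
Dropping the packed type_level encoding means compression levels travel as plain signed ints, which is what lets negative zstd levels survive from the mount option down to the workspace. A sketch of the behaviour under the new rules (the option strings are examples, not taken from this diff):

    /* Parsing the suffix of e.g. "compress=zstd:-3" now goes through
     * kstrtoint(), so a negative level is representable: */
    int level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, ":-3");

    /* Out-of-range requests are clamped rather than rejected, per the new
     * btrfs_compress_set_level():
     *      level = min(max(level, ops->min_level), ops->max_level);
     */
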
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3dc5a35dd19b..a2e7979372cc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4306,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 data_size)
{
int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
unsigned long ptr;
@@ -4320,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
write_extent_buffer(leaf, data, ptr, data_size);
btrfs_mark_buffer_dirty(trans, leaf);
}
- btrfs_free_path(path);
return ret;
}
@@ -4608,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u64 min_trans)
{
struct extent_buffer *cur;
- struct btrfs_key found_key;
int slot;
int sret;
u32 nritems;
@@ -4644,7 +4642,8 @@ again:
goto find_next_key;
ret = 0;
path->slots[level] = slot;
- btrfs_item_key_to_cpu(cur, &found_key, slot);
+ /* Save our key to return to the caller. */
+ btrfs_item_key_to_cpu(cur, min_key, slot);
goto out;
}
if (sret && slot > 0)
@@ -4668,8 +4667,8 @@ find_next_key:
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
+ path->slots[level] = slot;
if (slot >= nritems) {
- path->slots[level] = slot;
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -4679,11 +4678,10 @@ find_next_key:
goto out;
}
}
- /* save our key for returning back */
- btrfs_node_key_to_cpu(cur, &found_key, slot);
- path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
+ /* Save our key for returning back. */
+ btrfs_node_key_to_cpu(cur, min_key, slot);
goto out;
}
cur = btrfs_read_node_slot(cur, slot);
@@ -4700,10 +4698,8 @@ find_next_key:
}
out:
path->keep_locks = keep_locks;
- if (ret == 0) {
+ if (ret == 0)
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- memcpy(min_key, &found_key, sizeof(found_key));
- }
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1096a80a64e7..075a06db43a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,7 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include "linux/cleanup.h"
+#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..d4310d93f532 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -225,7 +225,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int ret = 0;
u64 cur = 0;
@@ -250,24 +250,24 @@ again:
goto cleanup;
}
- if (cur >= i_size_read(inode)) {
- iput(inode);
+ if (cur >= i_size_read(&inode->vfs_inode)) {
+ iput(&inode->vfs_inode);
goto cleanup;
}
/* Do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(ra, inode->vfs_inode.i_mapping);
sb_start_write(fs_info->sb);
ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
+ BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret < 0)
goto cleanup;
@@ -1352,17 +1352,18 @@ out:
* (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
* defragging all the range).
*/
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int compress_level = 0;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
@@ -1376,10 +1377,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
return -EINVAL;
if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
+ if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress.type) {
+ compress_type = range->compress.type;
+ compress_level = range->compress.level;
+ if (!btrfs_compress_level_valid(compress_type, compress_level))
+ return -EINVAL;
+ }
+ } else {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
}
if (extent_thresh == 0)
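With the new BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL flag, the defrag ioctl can carry an explicit compression level, which the kernel validates with btrfs_compress_level_valid(). A hypothetical userspace sketch (not part of this patch; the compress.type/compress.level field names come from the kernel-side accessors above, while the numeric zstd value and the exact uapi layout are assumptions):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    #define EXAMPLE_COMPRESS_ZSTD 3   /* assumed to match the kernel's enum btrfs_compression_type */

    static int defrag_with_zstd_level(int fd, __u32 level)
    {
            struct btrfs_ioctl_defrag_range_args args = { 0 };

            args.len = (__u64)-1;                     /* defrag the whole file */
            args.flags = BTRFS_DEFRAG_RANGE_COMPRESS |
                         BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL;
            args.compress.type = EXAMPLE_COMPRESS_ZSTD;
            args.compress.level = level;              /* rejected unless btrfs_compress_level_valid() */

            if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0) {
                    perror("BTRFS_IOC_DEFRAG_RANGE");
                    return -1;
            }
            return 0;
    }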
@@ -1402,8 +1414,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* defrag range can be written sequentially.
*/
start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
+ if (start_index < inode->vfs_inode.i_mapping->writeback_index)
+ inode->vfs_inode.i_mapping->writeback_index = start_index;
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
@@ -1420,27 +1432,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
cluster_end = min(cluster_end, last_byte);
- btrfs_inode_lock(BTRFS_I(inode), 0);
- if (IS_SWAPFILE(inode)) {
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(&inode->vfs_inode)) {
ret = -ETXTBSY;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ if (do_compress) {
+ inode->defrag_compress = compress_type;
+ inode->defrag_compress_level = compress_level;
+ }
+ ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
newer_than, do_compress, &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
cur = max(cluster_end + 1, last_scanned);
@@ -1462,10 +1476,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* need to be written back immediately.
*/
if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
+ filemap_flush(inode->vfs_inode.i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
+ &inode->runtime_flags))
+ filemap_flush(inode->vfs_inode.i_mapping);
}
if (range->compress_type == BTRFS_COMPRESS_LZO)
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
@@ -1474,9 +1488,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
ret = sectors_defragged;
}
if (do_compress) {
- btrfs_inode_lock(BTRFS_I(inode), 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_lock(inode, 0);
+ inode->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
}
return ret;
}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 6b7596c4f0dc..a7f917a38dbf 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -6,14 +6,14 @@
#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct file_ra_state;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_ioctl_defrag_range_args;
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0b4933c6a889..3f1551d8a5c6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1211,7 +1211,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1238,7 +1238,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
btrfs_release_delayed_node(delayed_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
@@ -1817,53 +1816,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
+ struct inode *vfs_inode = &inode->vfs_inode;
u64 flags;
- btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
- btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
- btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
- btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
- btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
- btrfs_set_stack_inode_generation(inode_item,
- BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
+ btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
+ btrfs_set_stack_inode_generation(inode_item, inode->generation);
btrfs_set_stack_inode_sequence(inode_item,
- inode_peek_iversion(inode));
+ inode_peek_iversion(vfs_inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
- btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
- BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev);
+ flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags);
btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode_get_atime_sec(inode));
+ inode_get_atime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode_get_atime_nsec(inode));
+ inode_get_atime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode_get_mtime_sec(inode));
+ inode_get_mtime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode_get_mtime_nsec(inode));
+ inode_get_mtime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime_sec(inode));
+ inode_get_ctime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime_nsec(inode));
+ inode_get_ctime_nsec(vfs_inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec);
}
-int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
+ struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return -ENOENT;
@@ -1876,39 +1875,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode_item = &delayed_node->inode_item;
- i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
- i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_mode = btrfs_stack_inode_mode(inode_item);
- set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
- inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
- BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
-
- inode_set_iversion_queried(inode,
- btrfs_stack_inode_sequence(inode_item));
- inode->i_rdev = 0;
+ i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+ vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
+ inode->generation = btrfs_stack_inode_generation(inode_item);
+ inode->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item));
+ vfs_inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
- inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime),
btrfs_stack_timespec_nsec(&inode_item->atime));
- inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime),
btrfs_stack_timespec_nsec(&inode_item->mtime));
- inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
+ inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
- inode->i_generation = BTRFS_I(inode)->generation;
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ vfs_inode->i_generation = inode->generation;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -1928,8 +1926,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item,
- &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
goto release_node;
}
@@ -1937,7 +1934,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f4d9feac0d0e..c4b4ba122beb 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
-int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
/* Used for drop dead root */
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a35067cebb97..f5ae880308d3 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -14,6 +14,8 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+#include "messages.h"
struct btrfs_trans_handle;
struct btrfs_fs_info;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f86fbea0b3de..53d7d85cb4be 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct extent_buffer *eb;
int slot;
int ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
@@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
return 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found:
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
@@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
- goto out;
+ return 0;
}
slot = path->slots[0];
eb = path->nodes[0];
@@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
@@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
key.offset = 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
- goto out;
+ return ret;
}
}
@@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
-out:
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index ccf91de29f80..b29cc31a7c4a 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int data_size;
struct extent_buffer *leaf;
int slot;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
@@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
if (ret < 0)
- goto out;
+ return ret;
}
/* we found an item, look for our name in the item */
if (di) {
/* our exact name was found */
- ret = -EEXIST;
- goto out;
+ *    Here we have to wait for the OE to finish, as it may contain the
}
/* See if there is room in the item to insert this name. */
@@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
- ret = -EOVERFLOW;
- } else {
- /* plenty of insertion room */
- ret = 0;
+ return -EOVERFLOW;
}
-out:
- btrfs_free_path(path);
- return ret;
+
+ /* Plenty of insertion room. */
+ return 0;
}
/*
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 28d69970bc70..8462579a95f4 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -10,6 +10,7 @@ struct fscrypt_str;
struct btrfs_fs_info;
struct btrfs_key;
struct btrfs_path;
+struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 8567af46e16f..a374ce7a1813 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -248,7 +248,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
block_start = extent_map_block_start(em) + (start - em->start);
- if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {
+ if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
+ false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -855,6 +856,22 @@ relock:
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
+ /*
+ * We can't control the folios being passed in, applications can write
+ * to them while a direct IO write is in progress. This means the
+ * content might change after we calculated the data checksum.
+ * Therefore we can end up storing a checksum that doesn't match the
+ * persisted data.
+ *
+ * To be extra safe and avoid false data checksum mismatch, if the
+ * inode requires data checksum, just fall back to buffered IO.
+ * For buffered IO we have full control of page cache and can ensure
+ * no one is modifying the content during writeback.
+ */
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
/*
* The iov_iter can be mapped to the same file range we are writing to.
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
index 3dc3ea926afe..df5d45ee6de7 100644
--- a/fs/btrfs/direct-io.h
+++ b/fs/btrfs/direct-io.h
@@ -5,6 +5,8 @@
#include <linux/types.h>
+struct kiocb;
+
int __init btrfs_init_dio(void);
void __cold btrfs_destroy_dio(void);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e815d165cccc..d6eef4bd9e9d 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -260,9 +254,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
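+ /*
+ * Pin the block group while it is the active discard target; the
+ * discard workfn drops this reference once it is done with it and
+ * clears discard_ctl->block_group.
+ */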
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index dddb0f9101ba..2c5e85394092 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
+#include <linux/types.h>
#include <linux/sizes.h>
struct btrfs_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f09db62e61a1..1a916716cefe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,13 +182,12 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 start = max_t(u64, eb->start, folio_pos(folio));
u64 end = min_t(u64, eb->start + eb->len,
@@ -284,8 +283,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
- eb->start, eb->len)))
+ if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -1089,21 +1087,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key)
{
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
- btrfs_free_path(path);
return root;
}
/*
- * Initialize subvolume root in-memory structure
+ * Initialize subvolume root in-memory structure.
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ *
+ * In case of failure the caller is responsible to call btrfs_free_fs_root()
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
@@ -1127,7 +1126,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ return ret;
} else {
root->anon_dev = anon_dev;
}
@@ -1137,7 +1136,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto fail;
+ return ret;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1145,9 +1144,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
mutex_unlock(&root->objectid_mutex);
return 0;
-fail:
- /* The caller is responsible to call btrfs_free_fs_root */
- return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -2200,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
static int load_global_roots(struct btrfs_root *tree_root)
{
- struct btrfs_path *path;
- int ret = 0;
+ BTRFS_PATH_AUTO_FREE(path);
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -2210,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root)
ret = load_global_roots_objectid(tree_root, path,
BTRFS_EXTENT_TREE_OBJECTID, "extent");
if (ret)
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_CSUM_TREE_OBJECTID, "csum");
if (ret)
- goto out;
+ return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_FREE_SPACE_TREE_OBJECTID,
"free space");
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2447,21 +2442,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
/*
- * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
+ *
+ * For 4K page sized systems with non-debug builds, all 3 match (4K).
+ * For 4K page sized systems with debug builds, two block sizes are
+ * supported (4K and 2K).
*
* We can support 16K sectorsize with 64K page size without problem,
* but such sectorsize/pagesize combination doesn't make much sense.
* 4K will be our future standard, PAGE_SIZE is supported from the very
* beginning.
*/
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
+ sectorsize != PAGE_SIZE &&
+ sectorsize != BTRFS_MIN_BLOCKSIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2561,6 +2562,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (ret)
+ return ret;
+
ret = validate_sys_chunk_array(fs_info, sb);
/*
@@ -3390,7 +3394,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
@@ -3416,11 +3419,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE)
- btrfs_warn(fs_info,
- "read-write for sector size %u with page size %lu is experimental",
- sectorsize, PAGE_SIZE);
-
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -4326,6 +4324,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4346,6 +4352,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turn calls
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
+
+ /*
* After we parked the cleaner kthread, ordered extents may have
* completed and created new delayed iputs. If one of the async reclaim
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
@@ -4369,6 +4400,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
btrfs_run_delayed_iputs(fs_info);
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
@@ -4403,9 +4436,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4528,10 +4558,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4902,7 +4928,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
@@ -4918,14 +4944,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
@@ -4936,10 +4961,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e2b22bea348a..7fc8a3200b40 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
- struct inode *inode;
+ struct btrfs_inode *inode;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation != 0 && generation != inode->i_generation) {
- iput(inode);
+ if (generation != 0 && generation != inode->vfs_inode.i_generation) {
+ iput(&inode->vfs_inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ return d_obtain_alias(&inode->vfs_inode);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = d_inode(child);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *dir = BTRFS_I(d_inode(child));
+ struct btrfs_inode *inode;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_root_ref *ref;
@@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
- key.objectid = btrfs_ino(BTRFS_I(dir));
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(key.objectid, root));
+ inode = btrfs_iget(key.objectid, root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_obtain_alias(&inode->vfs_inode);
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -219,11 +224,11 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = d_inode(child);
- struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(child));
+ struct btrfs_inode *dir = BTRFS_I(d_inode(parent));
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
@@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name,
int ret;
u64 ino;
- if (!S_ISDIR(dir->i_mode))
+ if (!S_ISDIR(dir->vfs_inode.i_mode))
return -EINVAL;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
+ key.objectid = btrfs_root_id(inode->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
key.objectid = ino;
- key.offset = btrfs_ino(BTRFS_I(dir));
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = btrfs_ino(dir);
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
path->slots[0]--;
- } else {
- btrfs_free_path(path);
+ else
return -ENOENT;
- }
}
leaf = path->nodes[0];
@@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
read_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_free_path(path);
/*
* have to add the null termination to make sure that reconnect_path
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 6d08c100b01d..13de6af279e5 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -346,10 +346,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(const struct extent_io_tree *tree,
- const struct extent_state *state,
- const char *opname,
- int err)
+static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
btrfs_panic(extent_io_tree_to_fs_info(tree), err,
"extent io tree error on %s state start %llu end %llu",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3014a1a23efd..957230abd827 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
struct btrfs_root *root = btrfs_extent_root(fs_info, start);
- int ret;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = start;
- key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- btrfs_free_path(path);
- return ret;
+ key.offset = len;
+ return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
/*
@@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 num_refs;
u64 extent_flags;
@@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
search_again:
key.objectid = bytenr;
- key.offset = offset;
if (metadata)
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = offset;
extent_root = btrfs_extent_root(fs_info, bytenr);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out_free;
+ return ret;
if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
@@ -159,7 +156,7 @@ search_again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -170,7 +167,7 @@ search_again:
"unexpected zero reference count for extent item (%llu %u %llu)",
key.objectid, key.type, key.offset);
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
extent_flags = btrfs_extent_flags(leaf, ei);
owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
@@ -216,8 +213,7 @@ search_again:
*flags = extent_flags;
if (owning_root)
*owning_root = owner;
-out_free:
- btrfs_free_path(path);
+
return ret;
}
@@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
@@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
- goto out;
+ return ret;
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, ret);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -1631,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
u32 item_size;
@@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
@@ -1679,8 +1674,8 @@ again:
metadata = 0;
key.objectid = head->bytenr;
- key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = head->num_bytes;
goto again;
}
} else {
@@ -1688,7 +1683,7 @@ again:
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, head->level);
- goto out;
+ return ret;
}
}
@@ -1701,13 +1696,12 @@ again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2348,8 +2342,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode,
int ret;
key.objectid = bytenr;
- key.offset = (u64)-1;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
@@ -2874,7 +2868,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
block_group->length,
&trimmed);
+ /*
+ * Not strictly necessary to lock, as the block_group should be
+ * read-only from btrfs_delete_unused_bgs().
+ */
+ ASSERT(block_group->ro);
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -5465,7 +5467,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_inline_ref *iref;
int ret;
bool exists = false;
@@ -5482,7 +5484,6 @@ again:
* If we get 0 then we found our reference, return 1, else
* return the error if it's not -ENOENT;
*/
- btrfs_free_path(path);
return (ret < 0 ) ? ret : 1;
}
@@ -5517,7 +5518,6 @@ again:
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
return exists ? 1 : 0;
}
@@ -6285,7 +6285,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct walk_control *wc;
int level;
int parent_level;
@@ -6298,10 +6298,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- if (!wc) {
- btrfs_free_path(path);
+ if (!wc)
return -ENOMEM;
- }
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
@@ -6338,7 +6336,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
}
kfree(wc);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index cfa52264f678..0ed682d9ed7b 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -4,7 +4,6 @@
#define BTRFS_EXTENT_TREE_H
#include <linux/types.h>
-#include "misc.h"
#include "block-group.h"
#include "locking.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b2fae67f8fa3..197f5e51c474 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -425,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_pos(folio) + folio_size(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio);
else
btrfs_folio_end_lock(fs_info, folio, start, len);
@@ -488,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio));
- btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}
/*
@@ -753,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
{
struct btrfs_inode *inode = folio_to_inode(folio);
- ASSERT(pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
@@ -836,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
if (folio->mapping)
lockdep_assert_held(&folio->mapping->i_private_lock);
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
if (!folio_test_private(folio))
folio_attach_private(folio, eb);
else
@@ -870,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
+ if (btrfs_is_subpage(fs_info, folio))
return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
@@ -887,8 +887,8 @@ void clear_folio_extent_mapped(struct folio *folio)
return;
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_detach_subpage(fs_info, folio);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
@@ -935,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = folio_pos(folio);
- const u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
- u64 block_start;
struct extent_map *em;
int ret = 0;
- size_t pg_offset = 0;
- size_t iosize;
- size_t blocksize = fs_info->sectorsize;
+ const size_t blocksize = fs_info->sectorsize;
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
@@ -955,25 +951,29 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
- if (zero_offset) {
- iosize = folio_size(folio) - zero_offset;
- folio_zero_range(folio, zero_offset, iosize);
- }
+ if (zero_offset)
+ folio_zero_range(folio, zero_offset,
+ folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio);
- while (cur <= end) {
+ for (u64 cur = start; cur <= end; cur += blocksize) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
+ unsigned long pg_offset = offset_in_folio(folio, cur);
bool force_bio_submit = false;
u64 disk_bytenr;
+ u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = folio_size(folio) - pg_offset;
- folio_zero_range(folio, pg_offset, iosize);
- end_folio_read(folio, true, cur, iosize);
+ folio_zero_range(folio, pg_offset, end - cur + 1);
+ end_folio_read(folio, true, cur, end - cur + 1);
break;
}
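+ /*
+ * The block may already be uptodate, e.g. when it is covered by a
+ * still-running ordered extent that lock_extents_for_read() decided
+ * not to wait for; no need to read it from disk again.
+ */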
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ end_folio_read(folio, true, cur, blocksize);
+ continue;
+ }
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
@@ -985,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
compress_type = extent_map_compression(em);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
disk_bytenr = extent_map_block_start(em) + extent_offset;
- block_start = extent_map_block_start(em);
+
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
+ else
+ block_start = extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1042,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- folio_zero_range(folio, pg_offset, iosize);
-
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ folio_zero_range(folio, pg_offset, blocksize);
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
@@ -1064,15 +1059,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
pg_offset);
- cur = cur + iosize;
- pg_offset += iosize;
}
-
return 0;
}
+/*
+ * Check if we can skip waiting the @ordered extent covering the block at @fileoff.
+ *
+ * @fileoff: Both input and output.
+ * Input as the file offset where the check should start at.
+ * Output as where the next check should start at,
+ * if the function returns true.
+ *
+ * Return true if we can skip to @fileoff. The caller needs to check the new
+ * @fileoff value to make sure it covers the full range, before skipping the
+ * full OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 *fileoff)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio;
+ const u32 blocksize = fs_info->sectorsize;
+ u64 cur = *fileoff;
+ bool ret;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
+
+ /*
+ * We should have locked the folio(s) for range [start, end], thus
+ * there must be a folio and it must be locked.
+ */
+ ASSERT(!IS_ERR(folio));
+ ASSERT(folio_test_locked(folio));
+
+ /*
+ * There are several cases for the folio and OE combination:
+ *
+ * 1) Folio has no private flag
+ * The OE has all its IO done but not yet finished, and folio got
+ * invalidated.
+ *
+ * Have we have to wait for the OE to finish, as it may contain the
+ * to-be-inserted data checksum.
+ * Without the data checksum inserted into the csum tree, read will
+ * just fail with missing csum.
+ */
+ if (!folio_test_private(folio)) {
+ ret = false;
+ goto out;
+ }
+
+ /*
+ * 2) The first block is DIRTY.
+ *
+ * This means the OE is created by some other folios whose file pos is
+ * before this one. And since we are holding the folio lock, the writeback
+ * of this folio cannot start.
+ *
+ * have finished our folio read and unlocked the folio.
+ * finished our folio read and unlocked the folio.
+ */
+ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ ret = true;
+ /*
+ * At least inside the folio, all the remaining blocks should
+ * also be dirty.
+ */
+ ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+ *fileoff = ordered->file_offset + ordered->num_bytes;
+ goto out;
+ }
+
+ /*
+ * 3) The first block is uptodate.
+ *
+ * At least the first block can be skipped, but we are still not fully
+ * sure. E.g. if the OE has some other folios in the range that cannot
+ * be skipped.
+ * So we return true and update @fileoff to the OE/folio boundary.
+ */
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ /*
+ * The whole range to the OE end or folio boundary should also
+ * be uptodate.
+ */
+ ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+ ret = true;
+ *fileoff = cur + range_len;
+ goto out;
+ }
+
+ /*
+ * 4) The first block is not uptodate.
+ *
+ * This means the folio was invalidated after the writeback finished,
+ * but was later inserted back into the filemap by some other operation
+ * (e.g. a block aligned buffered write).
+ * Very much the same as case 1).
+ */
+ ret = false;
+out:
+ folio_put(folio);
+ return ret;
+}
+
+static bool can_skip_ordered_extent(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 start, u64 end)
+{
+ const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+ u64 cur = max(start, ordered->file_offset);
+
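+ /*
+ * Walk the part of [start, end] covered by this ordered extent; all of
+ * it must be skippable, otherwise the caller has to wait for the whole
+ * ordered extent.
+ */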
+ while (cur < range_end) {
+ bool can_skip;
+
+ can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
+ if (!can_skip)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Locking helper to make sure we get a stable view of extent maps for the
+ * involved range.
+ *
+ * This is for folio read paths (read and readahead), thus the involved range
+ * should have all the folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ u64 cur_pos;
+
+ /* Caller must provide a valid @cached_state. */
+ ASSERT(cached_state);
+
+ /* The range must at least be page aligned, as all read paths are folio based. */
+ ASSERT(IS_ALIGNED(start, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+ lock_extent(&inode->io_tree, start, end, cached_state);
+ cur_pos = start;
+ while (cur_pos < end) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_range(inode, cur_pos,
+ end - cur_pos + 1);
+ /*
+ * No ordered extents in the range, and since we hold the extent lock
+ * no one can modify the extent maps in the range, so we're safe to return.
+ */
+ if (!ordered)
+ break;
+
+ /* Check if we can skip waiting for the whole OE. */
+ if (can_skip_ordered_extent(inode, ordered, start, end)) {
+ cur_pos = min(ordered->file_offset + ordered->num_bytes,
+ end + 1);
+ btrfs_put_ordered_extent(ordered);
+ continue;
+ }
+
+ /* Now wait for the OE to finish. */
+ unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
+ btrfs_put_ordered_extent(ordered);
+ /* We have unlocked the whole range, restart from the beginning. */
+ goto again;
+ }
+}
+
int btrfs_read_folio(struct file *file, struct folio *folio)
{
struct btrfs_inode *inode = folio_to_inode(folio);
@@ -1083,7 +1253,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
struct extent_map *em_cached = NULL;
int ret;
- btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+ lock_extents_for_read(inode, start, end, &cached_state);
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
unlock_extent(&inode->io_tree, start, end, &cached_state);
@@ -1105,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
unsigned int start_bit;
unsigned int nbits;
- ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1118,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
- const unsigned int bitmap_size = fs_info->sectors_per_page;
+ const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
- ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1157,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
struct writeback_control *wbc = bio_ctrl->wbc;
- const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
@@ -1184,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
- ASSERT(fs_info->sectors_per_page > 1);
+ if (btrfs_is_subpage(fs_info, folio)) {
+ ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
@@ -1264,7 +1435,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
btrfs_root_id(inode->root),
btrfs_ino(inode),
folio_pos(folio),
- fs_info->sectors_per_page,
+ blocks_per_folio,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else {
@@ -1309,7 +1480,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
unsigned int bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
- fs_info->sectors_per_page);
+ blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
@@ -1324,7 +1495,7 @@ out:
delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
+ * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
@@ -1333,7 +1504,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1434,6 +1605,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
bool submitted_io = false;
bool error = false;
const u64 folio_start = folio_pos(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
@@ -1442,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
start + len <= folio_start + folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
- if (ret) {
+ if (ret == -EAGAIN) {
/* Fixup worker will requeue */
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
folio_unlock(folio);
return 1;
}
+ if (ret < 0)
+ return ret;
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- fs_info->sectors_per_page);
+ blocks_per_folio);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
@@ -1530,6 +1704,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
@@ -1551,6 +1726,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* The proper bitmap can only be initialized until writepage_delalloc().
*/
bio_ctrl->submit_bitmap = (unsigned long)-1;
+
+ /*
+ * If the folio is dirty but has no private structure attached, it was
+ * marked dirty without informing the fs.
+ * Since the introduction of pin_user_pages*() that is a bug.
+ *
+ * So here we check whether the folio has private set to rule out such
+ * a case.
+ * But we also have a long history of relying on the COW fixup, so only
+ * enable this check for experimental builds until we're sure it's safe.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
+ unlikely(!folio_test_private(folio))) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), folio_pos(folio));
+ ret = -EUCLEAN;
+ goto done;
+ }
+
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
@@ -1562,14 +1761,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
goto done;
ret = extent_writepage_io(inode, folio, folio_pos(folio),
- PAGE_SIZE, bio_ctrl, i_size);
+ folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
if (ret < 0)
btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
btrfs_root_id(inode->root), btrfs_ino(inode),
- folio_pos(folio), fs_info->sectors_per_page,
+ folio_pos(folio), blocks_per_folio,
&bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
@@ -1725,20 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock(
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
struct folio_iter fi;
- u32 bio_offset = 0;
if (bbio->bio.bi_status != BLK_STS_OK)
set_btree_ioerr(eb);
bio_for_each_folio_all(fi, &bbio->bio) {
- u64 start = eb->start + bio_offset;
- struct folio *folio = fi.folio;
- u32 len = fi.length;
-
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- bio_offset += len;
+ btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1792,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
- if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
- bool ret;
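+ /*
+ * Add each folio's portion of the eb to the bio, covering both the
+ * subpage and the multi-folio cases with the same loop.
+ */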
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
folio_lock(folio);
- btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
- eb->len)) {
- folio_clear_dirty_for_io(folio);
- wbc->nr_to_write--;
- }
- ret = bio_add_folio(&bbio->bio, folio, eb->len,
- eb->start - folio_pos(folio));
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->len);
- folio_unlock(folio);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
- bool ret;
-
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ btrfs_meta_folio_clear_dirty(folio, eb);
+ btrfs_meta_folio_set_writeback(folio, eb);
+ if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
+ wbc_account_cgroup_owner(wbc, folio, range_len);
+ folio_unlock(folio);
}
btrfs_submit_bbio(bbio, 0);
}
@@ -1849,9 +2024,10 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
u64 folio_start = folio_pos(folio);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->sectors_per_page) {
+ while (bit_start < blocks_per_folio) {
struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
@@ -1867,7 +2043,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
+ if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&folio->mapping->i_private_lock);
@@ -1933,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct
if (!folio_test_private(folio))
return 0;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return submit_eb_subpage(folio, wbc);
spin_lock(&mapping->i_private_lock);
@@ -2192,10 +2368,8 @@ retry:
done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
- * the page lock: the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to
- * tmpfs file mapping
+ * the folio lock: the folio may be truncated or
+ * invalidated (changing folio->mapping to NULL).
*/
if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
@@ -2233,7 +2407,7 @@ retry:
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
- btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
+ btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2314,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
while (cur <= end) {
- u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
- u32 cur_len = cur_end + 1 - cur;
+ u64 cur_end;
+ u32 cur_len;
struct folio *folio;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2325,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* code is just in case, but shouldn't actually be run.
*/
if (IS_ERR(folio)) {
+ cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
cur = cur_end + 1;
continue;
}
+ cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end);
+ cur_len = cur_end + 1 - cur;
+
ASSERT(folio_test_locked(folio));
if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
@@ -2390,7 +2569,7 @@ void btrfs_readahead(struct readahead_control *rac)
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
- btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+ lock_extents_for_read(inode, start, end, &cached_state);
while ((folio = readahead_folio(rac)) != NULL)
btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
@@ -2443,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
@@ -2481,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -2592,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
return;
}
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -2618,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2629,7 +2808,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
spin_unlock(&folio->mapping->i_private_lock);
}
@@ -2662,15 +2841,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len)
+static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
- eb->len = len;
+ eb->len = fs_info->nodesize;
eb->fs_info = fs_info;
init_rwsem(&eb->lock);
@@ -2679,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
- ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+ ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -2687,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
- int num_folios = num_extent_folios(src);
int ret;
- new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ new = __alloc_extent_buffer(src->fs_info, src->start);
if (new == NULL)
return NULL;
@@ -2707,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return NULL;
}
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(src); i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
@@ -2723,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return new;
}
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- int num_folios = 0;
int ret;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return NULL;
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto err;
+ goto out;
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto err;
+ goto out_detach;
}
set_extent_buffer_uptodate(eb);
@@ -2750,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
-err:
- for (int i = 0; i < num_folios; i++) {
+
+out_detach:
+ for (int i = 0; i < num_extent_folios(eb); i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
folio_put(eb->folios[i]);
}
}
+out:
kmem_cache_free(extent_buffer_cache, eb);
return NULL;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
-{
- return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
-}
-
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -2805,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_folios= num_extent_folios(eb);
-
check_buffer_tree_ref(eb);
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_mark_accessed(eb->folios[i]);
}
@@ -2842,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2881,8 +3050,11 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
struct folio *folio)
@@ -2896,7 +3068,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(fs_info))
return NULL;
/* Page not yet attached to an extent buffer */
@@ -2999,7 +3171,7 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
- if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
@@ -3041,8 +3213,6 @@ finish:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
- unsigned long len = fs_info->nodesize;
- int num_folios;
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
@@ -3070,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
@@ -3090,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (btrfs_meta_is_subpage(fs_info)) {
+ prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3106,9 +3276,8 @@ reallocate:
goto out;
}
- num_folios = num_extent_folios(eb);
/* Attach all pages to the filemap. */
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
@@ -3148,7 +3317,7 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+ WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/*
* Check if the current page is physically contiguous with previous eb
@@ -3159,7 +3328,7 @@ reallocate:
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
- if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
+ if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/*
@@ -3202,7 +3371,7 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_unlock(eb->folios[i]);
return eb;
@@ -3233,7 +3402,7 @@ out:
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing page->mapping.
+ * so it can be cleaned up without utilizing folio->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3333,11 +3502,10 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty(struct folio *folio)
+static void btree_clear_folio_dirty_tag(struct folio *folio)
{
- ASSERT(folio_test_dirty(folio));
+ ASSERT(!folio_test_dirty(folio));
ASSERT(folio_test_locked(folio));
- folio_clear_dirty_for_io(folio);
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
__xa_clear_mark(&folio->mapping->i_pages,
@@ -3345,26 +3513,10 @@ static void btree_clear_folio_dirty(struct folio *folio)
xa_unlock_irq(&folio->mapping->i_pages);
}
-static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct folio *folio = eb->folios[0];
- bool last;
-
- /* btree_clear_folio_dirty() needs page locked. */
- folio_lock(folio);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
- if (last)
- btree_clear_folio_dirty(folio);
- folio_unlock(folio);
- WARN_ON(atomic_read(&eb->refs) == 0);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios;
btrfs_assert_tree_write_locked(eb);
@@ -3391,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return clear_subpage_extent_buffer_dirty(eb);
-
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
+ bool last;
if (!folio_test_dirty(folio))
continue;
folio_lock(folio);
- btree_clear_folio_dirty(folio);
+ last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
+ if (last)
+ btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
@@ -3409,37 +3560,34 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/*
* For subpage case, we can have other extent buffers in the
- * same page, and in clear_subpage_extent_buffer_dirty() we
+ * same page, and in btrfs_clear_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
- * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * Thankfully, btrfs_clear_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
folio_lock(eb->folios[0]);
- for (int i = 0; i < num_folios; i++)
- btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
- eb->start, eb->len);
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_dirty(eb->folios[i], eb);
if (subpage)
folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
@@ -3447,54 +3595,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
if (!folio)
continue;
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_clear_uptodate(folio);
- else
- btrfs_subpage_clear_uptodate(fs_info, folio,
- eb->start, eb->len);
+ btrfs_meta_folio_clear_uptodate(folio, eb);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
-
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_mark_uptodate(folio);
- else
- btrfs_subpage_set_uptodate(fs_info, folio,
- eb->start, eb->len);
- }
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
}
static void clear_extent_buffer_reading(struct extent_buffer *eb)
@@ -3507,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb)
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct folio_iter fi;
- u32 bio_offset = 0;
/*
* If the extent buffer is marked UPTODATE before the read operation
@@ -3532,19 +3654,6 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_folio_all(fi, &bbio->bio) {
- struct folio *folio = fi.folio;
- u64 start = eb->start + bio_offset;
- u32 len = fi.length;
-
- if (uptodate)
- btrfs_folio_set_uptodate(fs_info, folio, start, len);
- else
- btrfs_folio_clear_uptodate(fs_info, folio, start, len);
-
- bio_offset += len;
- }
-
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
@@ -3555,7 +3664,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
- bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3595,19 +3703,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
- if (eb->fs_info->nodesize < PAGE_SIZE) {
- ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
- eb->start - folio_pos(eb->folios[0]));
- ASSERT(ret);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
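+ /* Only add the part of the eb that this folio actually covers. */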
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
}
btrfs_submit_bbio(bbio, mirror_num);
return 0;
@@ -3796,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
- if (fs_info->nodesize < PAGE_SIZE) {
+ if (btrfs_meta_is_subpage(fs_info)) {
folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
@@ -4282,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return try_release_subpage_extent_buffer(folio);
/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6c5328bfabc2..2e261892c7bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -252,8 +252,6 @@ void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
@@ -276,7 +274,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
-static inline int num_extent_pages(const struct extent_buffer *eb)
+/* Note: this can be used in for loops without caching the value in a variable. */
+static inline int __pure num_extent_pages(const struct extent_buffer *eb)
{
/*
* For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
@@ -294,8 +293,10 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
* As we can have either one large folio covering the whole eb
* (either nodesize <= PAGE_SIZE, or high order folio), or multiple
* single-paged folios.
+ *
+ * Note: this can be used in for loops without caching the value in a variable.
*/
-static inline int num_extent_folios(const struct extent_buffer *eb)
+static inline int __pure num_extent_folios(const struct extent_buffer *eb)
{
if (folio_order(eb->folios[0]))
return 1;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d04a3b47b1fb..344b4db487a0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_file_extent_item *item;
struct btrfs_key file_key;
- struct btrfs_path *path;
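+ /* The path is freed automatically when it goes out of scope. */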
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
file_key.objectid = objectid;
- file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = pos;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, item, 0);
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int cow = mod != 0;
file_key.objectid = objectid;
- file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = offset;
return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
@@ -341,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
@@ -373,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
- if (!bbio->csum) {
- btrfs_free_path(path);
+ if (!bbio->csum)
return BLK_STS_RESOURCE;
- }
} else {
bbio->csum = bbio->csum_inline;
}
@@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
- btrfs_free_path(path);
return ret;
}
@@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
path->nowait = nowait;
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
@@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
while (1) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = end_byte - 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -1074,8 +1070,8 @@ again:
found_next = 0;
bytenr = sums->logical + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 0e13661a71f3..6181a70ec3ef 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,10 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/blk_types.h>
#include <linux/list.h>
#include <uapi/linux/btrfs_tree.h>
+#include "ctree.h"
#include "accessors.h"
struct extent_map;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0b568c8d24cb..262a707d8990 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
if (folio_test_uptodate(folio))
return 0;
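+ /*
+ * A block aligned range will be fully overwritten by the caller, so
+ * there is no need to read it in first.
+ */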
if (!force_uptodate &&
- IS_ALIGNED(clamp_start, PAGE_SIZE) &&
- IS_ALIGNED(clamp_end, PAGE_SIZE))
+ IS_ALIGNED(clamp_start, blocksize) &&
+ IS_ALIGNED(clamp_end, blocksize))
return 0;
ret = btrfs_read_folio(NULL, folio);
@@ -874,7 +875,6 @@ again:
ret = PTR_ERR(folio);
return ret;
}
- folio_wait_writeback(folio);
/* Only support page sized folio yet. */
ASSERT(folio_order(folio) == 0);
ret = set_folio_extent_mapped(folio);
@@ -1014,8 +1014,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, nowait);
+ ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@@ -1783,6 +1782,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
+ size_t fsize = folio_size(folio);
vm_fault_t ret;
int ret2;
int reserved = 0;
@@ -1793,7 +1793,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ASSERT(folio_order(folio) == 0);
- reserved_space = PAGE_SIZE;
+ reserved_space = fsize;
sb_start_pagefault(inode->i_sb);
page_start = folio_pos(folio);
@@ -1847,7 +1847,7 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
@@ -1859,11 +1859,11 @@ again:
if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
+ if (reserved_space < fsize) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
+ fsize - reserved_space, true);
}
}
@@ -1890,12 +1890,12 @@ again:
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
- zero_start = PAGE_SIZE;
+ zero_start = fsize;
- if (zero_start != PAGE_SIZE)
+ if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
@@ -1904,7 +1904,7 @@ again:
unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -1913,7 +1913,7 @@ out_unlock:
folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index de89e644be29..d7df81388cbe 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -9,6 +9,8 @@ struct file;
struct extent_state;
struct kiocb;
struct iov_iter;
+struct inode;
+struct folio;
struct page;
struct btrfs_ioctl_encoded_io_args;
struct btrfs_drop_extents_args;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d42b6f882f57..05e173311c1a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode;
unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
- mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_constraint(inode->i_mapping,
+ mapping_set_gfp_mask(inode->vfs_inode.i_mapping,
+ mapping_gfp_constraint(inode->vfs_inode.i_mapping,
~(__GFP_FS | __GFP_HIGHMEM)));
- return inode;
+ return &inode->vfs_inode;
}
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
@@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
if (PTR_ERR(inode) != -ENOENT)
ret = PTR_ERR(inode);
- goto out;
+ return ret;
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(BTRFS_I(inode));
- goto out;
+ return ret;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = 0;
- goto out;
+ return ret;
}
- ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, trans->fs_info->tree_root, path);
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
@@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
- struct page *page;
+ struct folio *folio;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
for (i = 0; i < io_ctl->num_pages; i++) {
int ret;
- page = find_or_create_page(inode->i_mapping, i, mask);
- if (!page) {
+ folio = __filemap_get_folio(inode->i_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
+ if (IS_ERR(folio)) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
- ret = set_folio_extent_mapped(page_folio(page));
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
io_ctl_drop_pages(io_ctl);
return ret;
}
- io_ctl->pages[i] = page;
- if (uptodate && !PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ io_ctl->pages[i] = &folio->page;
+ if (uptodate && !folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"free space cache page truncated");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
io_ctl_drop_pages(io_ctl);
@@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -1156,8 +1155,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index cae540ec15ed..39c6b96a4c25 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1062,7 +1062,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path, *path2;
+ BTRFS_PATH_AUTO_FREE(path);
+ BTRFS_PATH_AUTO_FREE(path2);
struct btrfs_key key;
u64 start, end;
int ret;
@@ -1070,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
- if (!path2) {
- btrfs_free_path(path);
+ if (!path2)
return -ENOMEM;
- }
+
+ path->reada = READA_FORWARD;
ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
- goto out;
+ return ret;
mutex_lock(&block_group->free_space_lock);
@@ -1146,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = 0;
out_locked:
mutex_unlock(&block_group->free_space_lock);
-out:
- btrfs_free_path(path2);
- btrfs_free_path(path);
+
return ret;
}
@@ -1217,7 +1215,7 @@ out_clear:
static int clear_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int nr;
int ret;
@@ -1233,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
nr = btrfs_header_nritems(path->nodes[0]);
if (!nr)
@@ -1242,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1638,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u32 extent_count, flags;
- int ret;
block_group = caching_ctl->block_group;
@@ -1657,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
path->reada = READA_FORWARD;
info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1670,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* there.
*/
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
- ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ return load_free_space_bitmaps(caching_ctl, path, extent_count);
else
- ret = load_free_space_extents(caching_ctl, path, extent_count);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return load_free_space_extents(caching_ctl, path, extent_count);
}
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 09cfb43580cb..b2bb86f8d7cf 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
-#include "ctree.h"
#include "fs.h"
#include "accessors.h"
#include "volumes.h"
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index b572d6b9730b..bcca43046064 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -47,6 +47,18 @@ struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;
+/*
+ * Minimum data and metadata block size.
+ *
+ * Normally it's 4K, but to test subpage block size on 4K page systems, we
+ * allow DEBUG builds to accept a 2K block size.
+ */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_MIN_BLOCKSIZE (SZ_2K)
+#else
+#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#endif
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -105,6 +117,9 @@ enum {
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ /* No more delayed iput can be queued. */
+ BTRFS_FS_STATE_NO_DELAYED_IPUT,
+
BTRFS_FS_STATE_COUNT
};
@@ -485,8 +500,8 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
- unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_type;
+ int compress_level;
u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
@@ -709,7 +724,6 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
struct btrfs_discard_ctl discard_ctl;
@@ -981,6 +995,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
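+
+/* Return the number of blocks (sectorsize units) covered by the given folio. */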
+static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
+ const struct folio *folio)
+{
+ return folio_size(folio) >> fs_info->sectorsize_bits;
+}
+
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 448aa1a682d6..3530de0618c8 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -191,8 +191,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -317,8 +317,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -493,8 +493,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
key.objectid = control->ino;
- key.offset = (u64)-1;
key.type = (u8)-1;
+ key.offset = (u64)-1;
search_again:
/*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 38756f8cef46..cc67d1a2d611 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t datasize;
key.objectid = btrfs_ino(inode);
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (offset != 0)
return false;
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (fs_info->sectorsize != PAGE_SIZE)
- return false;
-
/* Inline extents are limited to sectorsize. */
if (size > fs_info->sectorsize)
return false;
+ /* We do not allow a non-compressed extent to be as large as block size. */
+ if (data_len >= fs_info->sectorsize)
+ return false;
+
/* We cannot exceed the maximum inline data size. */
if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return false;
@@ -672,7 +663,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -832,7 +823,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(inode, small_write);
}
-static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
unsigned long end_index = end >> PAGE_SHIFT;
struct folio *folio;
@@ -840,13 +831,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- folio = filemap_get_folio(inode->i_mapping, index);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
end + 1 - start);
folio_put(folio);
}
@@ -886,6 +877,7 @@ static void compress_file_range(struct btrfs_work *work)
unsigned int poff;
int i;
int compress_type = fs_info->compress_type;
+ int compress_level = fs_info->compress_level;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -894,7 +886,7 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(inode, start, end);
/*
* All the folios should have been locked thus no failure.
@@ -968,13 +960,15 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress)
+ if (inode->defrag_compress) {
compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
+ compress_level = inode->defrag_compress_level;
+ } else if (inode->prop_compress) {
compress_type = inode->prop_compress;
+ }
/* Compression level is applied here. */
- ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
@@ -1090,7 +1084,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
if (locked_folio)
btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
start, async_extent->ram_size);
@@ -1272,10 +1265,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
- * while-loop, the ordered extents created in previous iterations are kept
- * intact. So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
@@ -1492,11 +1482,9 @@ out_unlock:
/*
* For the range (1). We have already instantiated the ordered extents
- * for this region. They are cleaned up by
- * btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range().
+ * for this region, thus we need to cleanup those ordered extents.
* EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
- * are also handled by the cleanup function.
+ * are also handled by the ordered extents cleanup.
*
* So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
* finish the writeback of the involved folios, which will be never submitted.
@@ -1507,6 +1495,8 @@ out_unlock:
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+ btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
locked_folio, NULL, clear_bits, page_ops);
}
@@ -1976,6 +1966,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode,
mapping_set_error(mapping, error);
}
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1 << BTRFS_ORDERED_PREALLOC)
+ : (1 << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+ /*
+ * On error, we need to clean up the ordered extents we created.
+ *
+ * We do not clear the folio Dirty flags because they are set and
+ * cleared by the caller.
+ */
+ if (ret < 0)
+ btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ return ret;
+}
+
/*
* when the nocow writeback callback runs. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -2020,15 +2069,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2154,75 +2200,21 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1);
- cow_start = (u64)-1;
if (ret) {
cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
+ cow_start = (u64)-1;
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -2231,11 +2223,11 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
- cow_start = (u64)-1;
if (ret) {
cow_end = end;
goto error;
}
+ cow_start = (u64)-1;
}
btrfs_free_path(path);
@@ -2249,27 +2241,44 @@ error:
* start cur_offset end
* |/////////////| |
*
+ * In this case, cow_start should be (u64)-1.
+ *
* For range [start, cur_offset) the folios are already unlocked (except
* @locked_folio), EXTENT_DELALLOC already removed.
- * Only need to clear the dirty flag as they will never be submitted.
- * Ordered extent and extent maps are handled by
- * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+ * Need to clear the dirty flags and finish the ordered extents.
+ *
+ * 2) Failed with error before calling fallback_to_cow()
+ *
+ * start cow_start end
+ * |/////////////| |
+ *
+ * In this case, only @cow_start is set, and @cur_offset is within
+ * [cow_start, end).
*
- * 2) Failed with error from fallback_to_cow()
- * start cur_offset cow_end end
+ * It's mostly the same as case 1), just replace @cur_offset with
+ * @cow_start.
+ *
+ * 3) Failed with error from fallback_to_cow()
+ *
+ * start cow_start cow_end end
* |/////////////|-----------| |
*
- * For range [start, cur_offset) it's the same as case 1).
- * But for range [cur_offset, cow_end), the folios have dirty flag
- * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range().
+ * In this case, both @cow_start and @cow_end are set.
*
- * Thus we should not call extent_clear_unlock_delalloc() on range
- * [cur_offset, cow_end), as the folios are already unlocked.
+ * For range [start, cow_start) it's the same as case 1).
+ * But for range [cow_start, cow_end), all the cleanup is handled by
+ * cow_file_range(), we should not touch anything in that range.
*
- * So clear the folio dirty flags for [start, cur_offset) first.
+ * So for all the above cases, if @cow_start is set, clean up the ordered
+ * extents for the range [start, @cow_start), otherwise clean up the range [start, @cur_offset).
*/
- if (cur_offset > start)
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+
+ if (cur_offset > start) {
+ btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+ }
/*
* If an error happened while a COW region is outstanding, cur_offset
@@ -2334,7 +2343,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
- goto out;
+ return ret;
}
if (btrfs_inode_can_compress(inode) &&
@@ -2348,10 +2357,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
else
ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
-
-out:
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
return ret;
}
@@ -2878,6 +2883,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
return 0;
/*
+ * For experimental builds, we error out instead of EAGAIN.
+ *
+ * We should not hit such out-of-band dirty folios anymore.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)),
+ folio_pos(folio));
+ return -EUCLEAN;
+ }
+
+ /*
* folio_checked is set below when we create a fixup worker for this
* folio, don't try to create another one if we're already
* folio_test_checked.
@@ -2896,7 +2916,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the folio lock, and we can't trust
- * page->mapping outside of the folio lock.
+ * folio->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
@@ -2952,8 +2972,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
+ ins.offset = file_pos;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
@@ -2988,8 +3008,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
- ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = disk_num_bytes;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
@@ -3407,6 +3427,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode)
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
+ WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
atomic_inc(&fs_info->nr_delayed_iputs);
/*
* Need to be irq safe here because we can be called from either an irq
@@ -3527,7 +3548,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
- struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
@@ -3546,6 +3566,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -3669,10 +3691,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (!inode || inode->i_nlink) {
+ if (!inode || inode->vfs_inode.i_nlink) {
if (inode) {
- ret = btrfs_drop_verity_items(BTRFS_I(inode));
- iput(inode);
+ ret = btrfs_drop_verity_items(inode);
+ iput(&inode->vfs_inode);
inode = NULL;
if (ret)
goto out;
@@ -3695,7 +3717,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
nr_unlink++;
/* this will do delete_inode and everything for us */
- iput(inode);
+ iput(&inode->vfs_inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3845,12 +3867,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
*
* On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
+static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode *vfs_inode = &inode->vfs_inode;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
@@ -3859,7 +3882,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ ret = btrfs_init_file_extent_tree(inode);
if (ret)
goto out;
@@ -3869,7 +3892,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
ASSERT(path);
- btrfs_get_inode_key(BTRFS_I(inode), &location);
+ btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3889,41 +3912,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
- i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
-
- inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+
+ inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
btrfs_timespec_nsec(leaf, &inode_item->mtime));
- inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
+ inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
- inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
- BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
+ inode->generation = btrfs_inode_generation(leaf, inode_item);
+ inode->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode_set_iversion_queried(inode,
- btrfs_inode_sequence(leaf, inode_item));
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
+ inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
+ vfs_inode->i_generation = inode->generation;
+ vfs_inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
+ btrfs_update_inode_mapping_flags(inode);
cache_index:
/*
@@ -3935,9 +3958,8 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
@@ -3966,7 +3988,7 @@ cache_index:
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
- BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_unlink_trans = inode->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
@@ -3974,15 +3996,15 @@ cache_index:
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
- BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_reflink_trans = inode->last_trans;
path->slots[0]++;
- if (inode->i_nlink != 1 ||
+ if (vfs_inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
- if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ if (location.objectid != btrfs_ino(inode))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3990,13 +4012,12 @@ cache_index:
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ inode->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
- extref);
+ inode->dir_index = btrfs_inode_extref_index(leaf, extref);
}
cache_acl:
/*
@@ -4004,50 +4025,49 @@ cache_acl:
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ btrfs_ino(inode), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)),
- btrfs_root_id(root), ret);
+ btrfs_ino(inode), btrfs_root_id(root), ret);
}
if (!maybe_acls)
- cache_no_acl(inode);
+ cache_no_acl(vfs_inode);
- switch (inode->i_mode & S_IFMT) {
+ switch (vfs_inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_fop = &btrfs_file_operations;
+ vfs_inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
- inode->i_fop = &btrfs_dir_file_operations;
- inode->i_op = &btrfs_dir_inode_operations;
+ vfs_inode->i_fop = &btrfs_dir_file_operations;
+ vfs_inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(vfs_inode);
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
+ vfs_inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ ret = btrfs_add_inode_to_root(inode, true);
if (ret)
goto out;
return 0;
out:
- iget_failed(inode);
+ iget_failed(vfs_inode);
return ret;
}
@@ -5602,7 +5622,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5614,40 +5634,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
- return inode;
+ if (!inode)
+ return NULL;
+ return BTRFS_I(inode);
}
/*
* Get an inode object given its inode number and corresponding root. Path is
* preallocated to prevent recursing back to iget through allocator.
*/
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path)
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
int ret;
inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
/*
* Get an inode object given its inode number and corresponding root.
*/
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_path *path;
int ret;
@@ -5655,7 +5677,7 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
path = btrfs_alloc_path();
@@ -5667,43 +5689,46 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
-static struct inode *new_simple_dir(struct inode *dir,
- struct btrfs_key *key,
- struct btrfs_root *root)
+static struct btrfs_inode *new_simple_dir(struct inode *dir,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
{
struct timespec64 ts;
- struct inode *inode = new_inode(dir->i_sb);
+ struct inode *vfs_inode;
+ struct btrfs_inode *inode;
- if (!inode)
+ vfs_inode = new_inode(dir->i_sb);
+ if (!vfs_inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = btrfs_grab_root(root);
- BTRFS_I(inode)->ref_root_id = key->objectid;
- set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
- set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ inode = BTRFS_I(vfs_inode);
+ inode->root = btrfs_grab_root(root);
+ inode->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
+ set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
- btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
+ btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
- inode->i_op = &simple_dir_inode_operations;
- inode->i_opflags &= ~IOP_XATTR;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ vfs_inode->i_op = &simple_dir_inode_operations;
+ vfs_inode->i_opflags &= ~IOP_XATTR;
+ vfs_inode->i_fop = &simple_dir_operations;
+ vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- ts = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, ts);
- inode_set_atime_to_ts(inode, inode_get_atime(dir));
- BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
- BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+ ts = inode_set_ctime_current(vfs_inode);
+ inode_set_mtime_to_ts(vfs_inode, ts);
+ inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
+ inode->i_otime_sec = ts.tv_sec;
+ inode->i_otime_nsec = ts.tv_nsec;
- inode->i_uid = dir->i_uid;
- inode->i_gid = dir->i_gid;
+ vfs_inode->i_uid = dir->i_uid;
+ vfs_inode->i_gid = dir->i_gid;
return inode;
}
@@ -5717,15 +5742,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
-static inline u8 btrfs_inode_type(struct inode *inode)
+static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
{
- return fs_umode_to_ftype(inode->i_mode);
+ return fs_umode_to_ftype(inode->vfs_inode.i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location = { 0 };
@@ -5742,18 +5767,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
if (btrfs_inode_type(inode) != di_type) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
- inode->i_mode, btrfs_inode_type(inode),
+ inode->vfs_inode.i_mode, btrfs_inode_type(inode),
di_type);
- iput(inode);
+ iput(&inode->vfs_inode);
return ERR_PTR(-EUCLEAN);
}
- return inode;
+ return &inode->vfs_inode;
}
ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
@@ -5768,19 +5793,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
btrfs_put_root(sub_root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
down_read(&fs_info->cleanup_work_sem);
- if (!sb_rdonly(inode->i_sb))
+ if (!sb_rdonly(inode->vfs_inode.i_sb))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&fs_info->cleanup_work_sem);
if (ret) {
- iput(inode);
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
}
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return &inode->vfs_inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
@@ -6253,7 +6281,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6339,6 +6367,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
+ btrfs_update_inode_mapping_flags(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6432,7 +6461,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
path = NULL;
if (args->subvol) {
- struct inode *parent;
+ struct btrfs_inode *parent;
/*
* Subvolumes inherit properties from their parent subvolume,
@@ -6442,11 +6471,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
- ret = btrfs_inode_inherit_props(trans, inode, parent);
- iput(parent);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ parent);
+ iput(&parent->vfs_inode);
}
} else {
- ret = btrfs_inode_inherit_props(trans, inode, dir);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ BTRFS_I(dir));
}
if (ret) {
btrfs_err(fs_info,
@@ -6544,7 +6575,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
- btrfs_inode_type(&inode->vfs_inode), index);
+ btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
@@ -6744,18 +6775,18 @@ fail:
return err;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6764,6 +6795,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 blocksize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6780,7 +6812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_SIZE, max_size);
+ max_size = min_t(unsigned long, blocksize, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
max_size);
@@ -6792,14 +6824,15 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+ if (max_size < blocksize)
+ folio_zero_range(folio, max_size, blocksize - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
+ const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
@@ -6814,14 +6847,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
return uncompress_inline(path, folio, fi);
- copy_size = min_t(u64, PAGE_SIZE,
+ copy_size = min_t(u64, blocksize,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < blocksize)
+ folio_zero_range(folio, copy_size, blocksize - copy_size);
return 0;
}
@@ -7062,17 +7095,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
int found_type;
@@ -7082,8 +7115,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return -ENOMEM;
path->nowait = nowait;
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), offset, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
if (ret < 0)
goto out;
@@ -7098,7 +7131,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
goto out;
@@ -7119,7 +7152,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.end = offset + *len - 1;
nocow_args.free_path = true;
- ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
/* can_nocow_file_extent() has freed the path. */
path = NULL;
@@ -7135,7 +7168,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.file_extent.offset))
goto out;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
@@ -7240,7 +7273,7 @@ static void wait_subpage_spinlock(struct folio *folio)
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@@ -7264,7 +7297,7 @@ static void wait_subpage_spinlock(struct folio *folio)
static int btrfs_launder_folio(struct folio *folio)
{
return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
- PAGE_SIZE, NULL);
+ folio_size(folio), NULL);
}
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -8499,8 +8532,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
- struct btrfs_inode *binode;
- struct inode *inode;
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
@@ -8511,30 +8542,30 @@ static int start_delalloc_inodes(struct btrfs_root *root,
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- binode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ struct btrfs_inode *inode;
+ struct inode *tmp_inode;
+
+ inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes);
- list_move_tail(&binode->delalloc_inodes,
- &root->delalloc_inodes);
+ list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
if (in_reclaim_context &&
- test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
continue;
- inode = igrab(&binode->vfs_inode);
- if (!inode) {
+ tmp_inode = igrab(&inode->vfs_inode);
+ if (!tmp_inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
- set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
- &binode->runtime_flags);
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
if (full_flush) {
- work = btrfs_alloc_delalloc_work(inode);
+ work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
if (!work) {
- iput(inode);
+ iput(&inode->vfs_inode);
ret = -ENOMEM;
goto out;
}
@@ -8542,8 +8573,8 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -8660,7 +8691,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ /*
+ * Symlinks utilize uncompressed inline extent data, which should not
+ * reach block size.
+ */
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ name_len >= fs_info->sectorsize)
return -ENAMETOOLONG;
inode = new_inode(dir->i_sb);
@@ -8699,8 +8735,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
@@ -9146,7 +9182,7 @@ out:
}
struct btrfs_encoded_read_private {
- struct completion done;
+ struct completion *sync_reads;
void *uring_ctx;
refcount_t pending_refs;
blk_status_t status;
@@ -9158,11 +9194,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
if (bbio->bio.bi_status) {
/*
- * The memory barrier implied by the atomic_dec_return() here
- * pairs with the memory barrier implied by the
- * atomic_dec_return() or io_wait_event() in
- * btrfs_encoded_read_regular_fill_pages() to ensure that this
- * write is observed before the load of status in
+ * The memory barrier implied by the refcount_dec_and_test() here
+ * pairs with the memory barrier implied by the refcount_dec_and_test()
+ * in btrfs_encoded_read_regular_fill_pages() to ensure that
+ * this write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
@@ -9174,7 +9209,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
- complete(&priv->done);
+ complete(priv->sync_reads);
}
}
bio_put(&bbio->bio);
@@ -9185,16 +9220,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private *priv;
+ struct btrfs_encoded_read_private *priv, sync_priv;
+ struct completion sync_reads;
unsigned long i = 0;
struct btrfs_bio *bbio;
int ret;
- priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
- if (!priv)
- return -ENOMEM;
+ /*
+ * Fast path for synchronous reads, which complete within this call;
+ * io_uring reads need the private data to live longer.
+ */
+ if (uring_ctx) {
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+ } else {
+ priv = &sync_priv;
+ init_completion(&sync_reads);
+ priv->sync_reads = &sync_reads;
+ }
- init_completion(&priv->done);
refcount_set(&priv->pending_refs, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
@@ -9237,11 +9282,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
return -EIOCBQUEUED;
} else {
if (!refcount_dec_and_test(&priv->pending_refs))
- wait_for_completion_io(&priv->done);
+ wait_for_completion_io(&sync_reads);
/* See btrfs_encoded_read_endio() for ordering. */
- ret = blk_status_to_errno(READ_ONCE(priv->status));
- kfree(priv);
- return ret;
+ return blk_status_to_errno(READ_ONCE(priv->status));
}
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6c18bad53cd3..a13d81bb56a0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 {
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
- unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
+ unsigned int flags)
{
if (S_ISDIR(inode->i_mode))
return flags;
@@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
+static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
{
unsigned int iflags = 0;
- u32 flags = binode->flags;
- u32 ro_flags = binode->ro_flags;
+ u32 flags = inode->flags;
+ u32 ro_flags = inode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
{
- struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (binode->flags & BTRFS_INODE_SYNC)
+ if (inode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (binode->flags & BTRFS_INODE_IMMUTABLE)
+ if (inode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (binode->flags & BTRFS_INODE_APPEND)
+ if (inode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (binode->flags & BTRFS_INODE_NOATIME)
+ if (inode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (binode->flags & BTRFS_INODE_DIRSYNC)
+ if (inode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
- if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ if (inode->ro_flags & BTRFS_INODE_RO_VERITY)
new_fl |= S_VERITY;
- set_mask_bits(&inode->i_flags,
+ set_mask_bits(&inode->vfs_inode.i_flags,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
S_VERITY, new_fl);
}
@@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
-static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
unsigned int flags)
{
if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
@@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
*/
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
- struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+ const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode));
return 0;
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags;
+ u32 inode_flags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
- fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(inode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (ret)
return ret;
- binode_flags = binode->flags;
+ inode_flags = inode->flags;
if (fsflags & FS_SYNC_FL)
- binode_flags |= BTRFS_INODE_SYNC;
+ inode_flags |= BTRFS_INODE_SYNC;
else
- binode_flags &= ~BTRFS_INODE_SYNC;
+ inode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
- binode_flags |= BTRFS_INODE_IMMUTABLE;
+ inode_flags |= BTRFS_INODE_IMMUTABLE;
else
- binode_flags &= ~BTRFS_INODE_IMMUTABLE;
+ inode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
- binode_flags |= BTRFS_INODE_APPEND;
+ inode_flags |= BTRFS_INODE_APPEND;
else
- binode_flags &= ~BTRFS_INODE_APPEND;
+ inode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
- binode_flags |= BTRFS_INODE_NODUMP;
+ inode_flags |= BTRFS_INODE_NODUMP;
else
- binode_flags &= ~BTRFS_INODE_NODUMP;
+ inode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
- binode_flags |= BTRFS_INODE_NOATIME;
+ inode_flags |= BTRFS_INODE_NOATIME;
else
- binode_flags &= ~BTRFS_INODE_NOATIME;
+ inode_flags &= ~BTRFS_INODE_NOATIME;
/* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
if (!fa->flags_valid) {
@@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
if (fsflags & FS_DIRSYNC_FL)
- binode_flags |= BTRFS_INODE_DIRSYNC;
+ inode_flags |= BTRFS_INODE_DIRSYNC;
else
- binode_flags &= ~BTRFS_INODE_DIRSYNC;
+ inode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
- if (inode->i_size == 0)
- binode_flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
} else {
- binode_flags |= BTRFS_INODE_NODATACOW;
+ inode_flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
* Revert back under same assumptions as above
*/
- if (S_ISREG(inode->i_mode)) {
- if (inode->i_size == 0)
- binode_flags &= ~(BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM);
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
} else {
- binode_flags &= ~BTRFS_INODE_NODATACOW;
+ inode_flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
- binode_flags &= ~BTRFS_INODE_COMPRESS;
- binode_flags |= BTRFS_INODE_NOCOMPRESS;
+ inode_flags &= ~BTRFS_INODE_COMPRESS;
+ inode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode))
+ if (IS_SWAPFILE(&inode->vfs_inode))
return -ETXTBSY;
- binode_flags |= BTRFS_INODE_COMPRESS;
- binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode_flags |= BTRFS_INODE_COMPRESS;
+ inode_flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
- binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
/*
@@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
- NULL, 0, 0);
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
update_flags:
- binode->flags = binode_flags;
+ inode->flags = inode_flags;
+ btrfs_update_inode_mapping_flags(inode);
btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
out_end_trans:
btrfs_end_transaction(trans);
return ret;
}
-static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
{
return put_user(inode->i_generation, arg);
}
@@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
*/
-static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
{
/*
* 1 to add root block
@@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
- key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
@@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *child)
+ struct inode *dir, const struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
@@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
- char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (!strcmp(sizestr, "max"))
new_size = bdev_nr_bytes(device->bdev);
else {
+ char *retptr;
+
if (sizestr[0] == '-') {
mod = -1;
sizestr++;
@@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = round_down(new_size, fs_info->sectorsize);
if (new_size > old_size) {
+ struct btrfs_trans_handle *trans;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -1336,15 +1335,15 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
+static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u64 flags = 0;
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&fs_info->subvol_sem);
@@ -1447,8 +1446,8 @@ out:
return ret;
}
-static noinline int key_in_sk(struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk)
+static noinline int key_in_sk(const struct btrfs_key *key,
+ const struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
@@ -1473,7 +1472,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk,
+ const struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
@@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
}
sh.objectid = key->objectid;
- sh.offset = key->offset;
sh.type = key->type;
+ sh.offset = key->offset;
sh.len = item_len;
sh.transid = found_transid;
@@ -1604,13 +1603,12 @@ out:
return ret;
}
-static noinline int search_ioctl(struct inode *inode,
+static noinline int search_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = inode_to_fs_info(inode);
- struct btrfs_root *root;
+ struct btrfs_fs_info *info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
int ret;
@@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode,
return -ENOMEM;
if (sk->tree_id == 0) {
- /* search the root of the inode that was passed */
- root = btrfs_grab_root(BTRFS_I(inode)->root);
+ /* Search the root that we got passed. */
+ root = btrfs_grab_root(root);
} else {
+ /* Look up the root from the arguments. */
root = btrfs_get_fs_root(info, sk->tree_id, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
@@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = -EFAULT;
/*
* Ensure that the whole user buffer is faulted in at sub-page
* granularity, otherwise the loop may live-lock.
*/
- if (fault_in_subpage_writeable(ubuf + sk_offset,
- *buf_size - sk_offset))
+ if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) {
+ ret = -EFAULT;
break;
+ }
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- goto err;
- }
+ if (ret)
+ break;
+
ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
@@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode,
break;
}
+ /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */
if (ret > 0)
ret = 0;
-err:
+
sk->nr_items = num_found;
btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs = argp;
@@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
buf_size = sizeof(uargs->buf);
- ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+ ret = search_ioctl(root, &sk, &buf_size, uargs->buf);
/*
* In the original implementation an overflow is handled by returning a
@@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
@@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
if (buf_size > buf_limit)
buf_size = buf_limit;
- ret = search_ioctl(inode, &args.key, &buf_size,
+ ret = search_ioctl(root, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
@@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
- struct inode *temp_inode;
char *ptr;
int slot;
int len;
@@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *temp_inode;
+
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out_put;
@@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(idmap, temp_inode,
+ ret = inode_permission(idmap, &temp_inode->vfs_inode,
MAY_READ | MAY_EXEC);
- iput(temp_inode);
+ iput(&temp_inode->vfs_inode);
if (ret) {
ret = -EACCES;
goto out_put;
@@ -2571,7 +2570,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -2763,7 +2762,7 @@ out_free:
return ret;
}
-static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -2817,7 +2816,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
return ret;
}
-static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
@@ -4248,7 +4247,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
return 0;
}
-static int check_feature_bits(struct btrfs_fs_info *fs_info,
+static int check_feature_bits(const struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
@@ -4384,7 +4383,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4415,7 +4414,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(inode, arg);
+ ret = btrfs_ioctl_send(root, arg);
kfree(arg);
return ret;
}
@@ -5242,7 +5241,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(inode, argp);
+ return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5264,9 +5263,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(inode, argp);
+ return btrfs_ioctl_tree_search(root, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(inode, argp);
+ return btrfs_ioctl_tree_search_v2(root, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
@@ -5314,10 +5313,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
+ return _btrfs_ioctl_send(root, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
+ return _btrfs_ioctl_send(root, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ce915fcda43b..e08ea446cf48 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -9,6 +9,8 @@ struct file;
struct dentry;
struct mnt_idmap;
struct fileattr;
+struct io_uring_cmd;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_ioctl_balance_args;
@@ -18,7 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9a7a7b723305..81e62b652e21 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
-#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4aca7475fd82..03c945711003 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
/*
* Start IO and wait for a given ordered extent to finish.
*
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent, except those in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len), and for the IO
+ * completion code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
- if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+ if (!nowriteback_len) {
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ } else {
+ if (start < nowriteback_start)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+ nowriteback_start - 1);
+ if (nowriteback_start + nowriteback_len < end)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+ nowriteback_start + nowriteback_len,
+ end);
+ }
+ }
if (!freespace_inode)
btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4e152736d06c..1e6b0b182b29 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -17,6 +17,7 @@
struct inode;
struct page;
struct extent_state;
+struct btrfs_block_group;
struct btrfs_inode;
struct btrfs_root;
struct btrfs_fs_info;
@@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len);
+static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ return btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
+}
+
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
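The new btrfs_start_ordered_extent_nowriteback() lets a caller exclude a sub-range from the writeback kick-off, typically a range whose folio the caller already holds locked, while still waiting for the whole ordered extent; the static inline wrapper passes an empty window, so every existing btrfs_start_ordered_extent() caller behaves exactly as before. A hypothetical caller might look like this (illustrative sketch, not taken from the patch):

	/* Skip writeback on the folio we already hold locked, then wait. */
	static void wait_ordered_skip_locked_folio(struct btrfs_ordered_extent *ordered,
						   struct folio *folio)
	{
		btrfs_start_ordered_extent_nowriteback(ordered, folio_pos(folio),
						       folio_size(folio));
	}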
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 8504bf1702c7..d0e620bf5f5a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+#include <linux/types.h>
+
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b8fa34e16abb..adc956432d2f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -26,8 +26,8 @@ struct prop_handler {
const char *xattr_name;
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
- int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(const struct inode *inode);
+ int (*apply)(struct btrfs_inode *inode, const char *value, size_t len);
+ const char *(*extract)(const struct btrfs_inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, NULL, 0);
+ ret = handler->apply(inode, NULL, 0);
ASSERT(ret == 0);
return ret;
@@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, value, value_len);
+ ret = handler->apply(inode, value, value_len);
if (ret) {
btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
@@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- ret = handler->apply(inode, value, len);
+ ret = handler->apply(BTRFS_I(inode), value, len);
if (unlikely(ret))
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
@@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx,
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct btrfs_root *root = inode->root;
+ u64 ino = btrfs_ino(inode);
- return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+ return iterate_object_props(root, path, ino, inode_prop_iterator,
+ &inode->vfs_inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
@@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
return -EINVAL;
}
-static int prop_compression_apply(struct inode *inode, const char *value,
+static int prop_compression_apply(struct btrfs_inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int type;
/* Reset to defaults */
if (len == 0) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
/* Set NOCOMPRESS flag */
if ((len == 2 && strncmp("no", value, 2) == 0) ||
(len == 4 && strncmp("none", value, 4) == 0)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
@@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return -EINVAL;
}
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = type;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
+ inode->prop_compress = type;
return 0;
}
@@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(const struct inode *inode)
+static const char *prop_compression_extract(const struct btrfs_inode *inode)
{
- switch (BTRFS_I(inode)->prop_compress) {
+ switch (inode->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
case BTRFS_COMPRESS_LZO:
case BTRFS_COMPRESS_ZSTD:
- return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ return btrfs_compress_type2str(inode->prop_compress);
default:
break;
}
@@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, const struct inode *parent)
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *parent)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
bool need_reserve = false;
- if (!test_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(parent)->runtime_flags))
+ if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags))
return 0;
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
@@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
- if (h->ignore(BTRFS_I(inode)))
+ if (h->ignore(inode))
continue;
value = h->extract(parent);
@@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(BTRFS_I(inode), value, strlen(value));
+ ret = h->validate(inode, value, strlen(value));
if (ret)
continue;
@@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value,
strlen(value), 0);
if (!ret) {
ret = h->apply(inode, value, strlen(value));
if (ret)
- btrfs_setxattr(trans, inode, h->xattr_name,
+ btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name,
NULL, 0, 0);
else
- set_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
}
if (need_reserve) {
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 63546d0a9444..15d9a025c923 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,9 +6,9 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
+#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct btrfs_inode;
struct btrfs_path;
struct btrfs_trans_handle;
@@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
const char *value, size_t value_len);
bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- const struct inode *dir);
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *dir);
#endif
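Most of the churn in props.c/props.h is the same mechanical conversion seen elsewhere in this series: helpers take struct btrfs_inode directly, stop calling BTRFS_I() internally, and only reach for the embedded VFS inode at true VFS boundaries (btrfs_setxattr() here). A minimal sketch of the relationship between the two types (BTRFS_I() is the real helper; the surrounding lines are illustrative):

	struct btrfs_inode *bi = BTRFS_I(vfs_inode);	/* container_of() on ->vfs_inode */
	struct inode *back = &bi->vfs_inode;		/* back to the generic VFS view */
	struct btrfs_root *root = bi->root;		/* btrfs-specific state hangs off here */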
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f9d3766c809b..d6fa36674270 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = 0;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
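The key-initialization swaps here and in the scrub.c hunk below are not functional changes; they only make the assignments follow the declaration order of struct btrfs_key, the prevailing style in btrfs. For reference, the key layout from the UAPI headers (shown only to make the ordering obvious):

	struct btrfs_key {
		__u64 objectid;
		__u8  type;
		__u64 offset;
	} __attribute__((__packed__));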
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e233cc79af18..a979fd59a4da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args;
struct btrfs_trans_handle;
struct btrfs_delayed_ref_root;
struct btrfs_inode;
+struct btrfs_transaction;
+struct btrfs_block_group;
+struct btrfs_qgroup_swapped_blocks;
/*
* Btrfs qgroup overview
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 541836421778..69942ad43140 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"
+#include "accessors.h"
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f0824c948cb7..15c296cb4dac 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -165,7 +165,7 @@ out:
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
@@ -175,8 +175,8 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
- struct btrfs_root *root = BTRFS_I(dst)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
@@ -185,12 +185,12 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
- key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -205,7 +205,7 @@ static int clone_copy_inline_extent(struct inode *dst,
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
@@ -214,7 +214,7 @@ static int clone_copy_inline_extent(struct inode *dst,
ASSERT(key.offset > 0);
goto copy_to_page;
}
- } else if (i_size_read(dst) <= datal) {
+ } else if (i_size_read(&inode->vfs_inode) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -236,7 +236,7 @@ copy_inline_extent:
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt with the same way.
*/
- if (i_size_read(dst) > datal) {
+ if (i_size_read(&inode->vfs_inode) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger than the one we want to
@@ -270,7 +270,7 @@ copy_inline_extent:
drop_args.start = drop_start;
drop_args.end = aligned_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
@@ -281,9 +281,9 @@ copy_inline_extent:
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- btrfs_set_inode_full_sync(BTRFS_I(dst));
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+ btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(inode);
+ ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
out:
if (!ret && !trans) {
/*
@@ -318,7 +318,7 @@ copy_to_page:
*/
btrfs_release_path(path);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
@@ -526,7 +526,7 @@ process_slot:
goto out;
}
- ret = clone_copy_inline_extent(inode, path, &new_key,
+ ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
@@ -617,26 +617,26 @@ out:
return ret;
}
-static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
- down_write(&BTRFS_I(inode1)->i_mmap_lock);
- down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+ down_write(&inode1->i_mmap_lock);
+ down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
-static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- up_write(&BTRFS_I(inode1)->i_mmap_lock);
- up_write(&BTRFS_I(inode2)->i_mmap_lock);
+ up_write(&inode1->i_mmap_lock);
+ up_write(&inode2->i_mmap_lock);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+ struct btrfs_inode *dst, u64 dst_loff)
{
const u64 end = dst_loff + len - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ struct btrfs_fs_info *fs_info = src->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
@@ -646,9 +646,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
+ lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+ ALIGN(len, bs), dst_loff, 1);
+ unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
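btrfs_double_mmap_lock() is behaviourally unchanged by the type conversion: it still takes the two i_mmap_lock rwsems in a fixed pointer order and annotates the second acquisition for lockdep. The general pattern, with hypothetical names, for readers unfamiliar with it:

	/* Lock two same-class locks without risking an ABBA deadlock (sketch only). */
	static void lock_pair(struct rw_semaphore *a, struct rw_semaphore *b)
	{
		if (a > b)
			swap(a, b);			/* fixed order across all callers */
		down_write(a);
		down_write_nested(b, SINGLE_DEPTH_NESTING);
	}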
@@ -678,8 +679,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+ BTRFS_I(dst), dst_loff);
if (ret)
goto out;
@@ -688,7 +689,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
}
if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+ BTRFS_I(dst), dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
@@ -775,24 +777,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize;
+ struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+ struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+ u64 bs = inode_out->root->fs_info->sectorsize;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+ struct btrfs_root *root_out = inode_out->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
- ASSERT(inode_in->i_sb == inode_out->i_sb);
+ ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
/* Don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+ (inode_out->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
@@ -811,7 +813,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
@@ -832,16 +834,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* Also, we don't need to check ASYNC_EXTENT, as async extents will be
* CoWed anyway, not affecting the nocow part.
*/
- ret = filemap_flush(inode_in->i_mapping);
+ ret = filemap_flush(inode_in->vfs_inode.i_mapping);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
if (ret < 0)
return ret;
@@ -863,8 +863,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
+ struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+ struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
bool same_inode = dst_inode == src_inode;
int ret;
@@ -872,9 +872,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
- lock_two_nondirectories(src_inode, dst_inode);
+ lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}
@@ -884,16 +884,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+ &dst_inode->vfs_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
- unlock_two_nondirectories(src_inode, dst_inode);
+ unlock_two_nondirectories(&src_inode->vfs_inode,
+ &dst_inode->vfs_inode);
}
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index af0969b70b53..f948f4f6431c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3239,21 +3239,23 @@ out:
return ret;
}
-static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group,
+static int delete_block_group_cache(struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_inode *btrfs_inode;
int ret = 0;
if (inode)
goto truncate;
- inode = btrfs_iget(ino, root);
- if (IS_ERR(inode))
+ btrfs_inode = btrfs_iget(ino, root);
+ if (IS_ERR(btrfs_inode))
return -ENOENT;
+ inode = &btrfs_inode->vfs_inode;
truncate:
ret = btrfs_check_trunc_cache_free_space(fs_info,
@@ -3313,8 +3315,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
}
if (!found)
return -ENOENT;
- ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
- space_cache_ino);
+ ret = delete_block_group_cache(block_group, NULL, space_cache_ino);
return ret;
}
@@ -3761,10 +3762,10 @@ out:
* the inode is in data relocation tree and its link count is 0
*/
static noinline_for_stack struct inode *create_reloc_inode(
- struct btrfs_fs_info *fs_info,
const struct btrfs_block_group *group)
{
- struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info = group->fs_info;
+ struct btrfs_inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
@@ -3792,18 +3793,19 @@ static noinline_for_stack struct inode *create_reloc_inode(
inode = NULL;
goto out;
}
- BTRFS_I(inode)->reloc_block_group_start = group->start;
+ inode->reloc_block_group_start = group->start;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (ret) {
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
- return inode;
+ return &inode->vfs_inode;
}
/*
@@ -3977,7 +3979,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ ret = delete_block_group_cache(rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
@@ -3986,7 +3988,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ rc->data_inode = create_reloc_inode(rc->block_group);
if (IS_ERR(rc->data_inode)) {
err = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 531312efee8d..2c5edcee9450 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1380,11 +1380,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
if (path->nodes[0])
goto search_forward;
+ key.objectid = search_start;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = search_start;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -2497,8 +2497,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
path->skip_locking = 1;
key.objectid = scrub_dev->devid;
- key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0ull;
while (1) {
u64 dev_extent_len;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f437138fefbc..0c8c58c4f29b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -16,7 +16,6 @@
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
-
#include "send.h"
#include "ctree.h"
#include "backref.h"
@@ -178,6 +177,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ struct fs_path cur_inode_path;
bool cur_inode_new;
bool cur_inode_new_gen;
bool cur_inode_deleted;
@@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx)
static void fs_path_reset(struct fs_path *p)
{
- if (p->reversed) {
+ if (p->reversed)
p->start = p->buf + p->buf_len - 1;
- p->end = p->start;
- *p->start = 0;
- } else {
+ else
p->start = p->buf;
- p->end = p->start;
- *p->start = 0;
- }
+
+ p->end = p->start;
+ *p->start = 0;
+}
+
+static void init_path(struct fs_path *p)
+{
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
}
static struct fs_path *fs_path_alloc(void)
@@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void)
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
- p->reversed = 0;
- p->buf = p->inline_buf;
- p->buf_len = FS_PATH_INLINE_SIZE;
- fs_path_reset(p);
+ init_path(p);
return p;
}
@@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p)
kfree(p);
}
-static int fs_path_len(struct fs_path *p)
+static inline int fs_path_len(const struct fs_path *p)
{
return p->end - p->start;
}
@@ -487,12 +490,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
- path_len = p->end - p->start;
+ path_len = fs_path_len(p);
old_buf_len = p->buf_len;
/*
@@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
int ret;
int new_len;
- new_len = p->end - p->start + name_len;
+ new_len = fs_path_len(p) + name_len;
if (p->start != p->end)
new_len++;
ret = fs_path_ensure_buf(p, new_len);
if (ret < 0)
- goto out;
+ return ret;
if (p->reversed) {
if (p->start != p->end)
@@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
*p->end = 0;
}
-out:
- return ret;
+ return 0;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
@@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len)
ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
memcpy(prepared, name, name_len);
-out:
- return ret;
+ return 0;
}
-static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2)
{
- int ret;
- char *prepared;
-
- ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
- if (ret < 0)
- goto out;
- memcpy(prepared, p2->start, p2->end - p2->start);
-
-out:
- return ret;
+ return fs_path_add(p, p2->start, fs_path_len(p2));
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
@@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(eb, prepared, off, len);
-out:
- return ret;
+ return 0;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
@@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p)
return;
tmp = p->start;
- len = p->end - p->start;
+ len = fs_path_len(p);
p->start = p->buf;
p->end = p->start + len;
memmove(p->start, tmp, len + 1);
p->reversed = 0;
}
+static inline bool is_current_inode_path(const struct send_ctx *sctx,
+ const struct fs_path *path)
+{
+ const struct fs_path *cur = &sctx->cur_inode_path;
+
+ return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0);
+}
+
static struct btrfs_path *alloc_path_for_send(void)
{
struct btrfs_path *path;
@@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
#define TLV_PUT_PATH(sctx, attrtype, p) \
do { \
ret = tlv_put_string(sctx, attrtype, p->start, \
- p->end - p->start); \
+ fs_path_len((p))); \
if (ret < 0) \
goto tlv_put_failure; \
} while(0)
@@ -826,7 +823,7 @@ static int send_rename(struct send_ctx *sctx,
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
@@ -834,7 +831,6 @@ static int send_rename(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -851,7 +847,7 @@ static int send_link(struct send_ctx *sctx,
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
@@ -859,7 +855,6 @@ static int send_link(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -875,14 +870,13 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -898,14 +892,13 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -1897,7 +1890,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
if (send_gen)
@@ -1908,7 +1901,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
} else {
ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
if (parent_gen)
@@ -1953,7 +1946,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = -ENOENT;
}
-out:
return ret;
}
@@ -1967,17 +1959,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
- goto out;
+ return ret;
if (ret == inode_state_no_change ||
ret == inode_state_did_create ||
ret == inode_state_will_delete)
- ret = 1;
- else
- ret = 0;
+ return 1;
-out:
- return ret;
+ return 0;
}
/*
@@ -2326,9 +2315,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
if (ret < 0)
- goto out;
- ret = nce->ret;
- goto out;
+ return ret;
+ return nce->ret;
}
}
@@ -2339,12 +2327,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*/
ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
goto out_cache;
}
@@ -2360,21 +2348,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
ret = get_first_ref(sctx->parent_root, ino,
parent_ino, parent_gen, dest);
if (ret < 0)
- goto out;
+ return ret;
/*
* Check if the ref was overwritten by an inode's ref that was processed
* earlier. If yes, treat as orphan and return 1.
*/
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
- dest->start, dest->end - dest->start);
+ dest->start, fs_path_len(dest));
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
fs_path_reset(dest);
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
}
@@ -2383,10 +2371,8 @@ out_cache:
* Store the result of the lookup in the name cache.
*/
nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
- if (!nce) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!nce)
+ return -ENOMEM;
nce->entry.key = ino;
nce->entry.gen = gen;
@@ -2404,10 +2390,9 @@ out_cache:
nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
if (nce_ret < 0) {
kfree(nce);
- ret = nce_ret;
+ return nce_ret;
}
-out:
return ret;
}
@@ -2444,6 +2429,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
+ const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
+
+ if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
+ if (dest != &sctx->cur_inode_path)
+ return fs_path_copy(dest, &sctx->cur_inode_path);
+
+ return 0;
+ }
name = fs_path_alloc();
if (!name) {
@@ -2495,8 +2488,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
out:
fs_path_free(name);
- if (!ret)
+ if (!ret) {
fs_path_unreverse(dest);
+ if (is_cur_inode && dest != &sctx->cur_inode_path)
+ ret = fs_path_copy(&sctx->cur_inode_path, dest);
+ }
+
return ret;
}
@@ -2591,6 +2588,47 @@ out:
return ret;
}
+static struct fs_path *get_cur_inode_path(struct send_ctx *sctx)
+{
+ if (fs_path_len(&sctx->cur_inode_path) == 0) {
+ int ret;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ &sctx->cur_inode_path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ return &sctx->cur_inode_path;
+}
+
+static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ struct fs_path *path;
+ int ret;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ return get_cur_inode_path(sctx);
+
+ path = fs_path_alloc();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = get_cur_path(sctx, ino, gen, path);
+ if (ret < 0) {
+ fs_path_free(path);
+ return ERR_PTR(ret);
+ }
+
+ return path;
+}
+
+static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path)
+{
+ if (path != &sctx->cur_inode_path)
+ fs_path_free(path);
+}
+
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2599,17 +2637,14 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
@@ -2617,7 +2652,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2629,17 +2664,14 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
@@ -2647,7 +2679,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2662,17 +2694,14 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
@@ -2680,7 +2709,7 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2693,17 +2722,14 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
ino, uid, gid);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
@@ -2712,7 +2738,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2729,9 +2755,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
btrfs_debug(fs_info, "send_utimes %llu", ino);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
path = alloc_path_for_send();
if (!path) {
@@ -2756,9 +2782,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
@@ -2770,7 +2793,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
btrfs_free_path(path);
return ret;
}
@@ -3106,6 +3129,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
ret = send_rename(sctx, path, orphan);
+ if (ret < 0)
+ goto out;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ ret = fs_path_copy(&sctx->cur_inode_path, orphan);
out:
fs_path_free(orphan);
@@ -4158,6 +4186,23 @@ out:
return ret;
}
+static int rename_current_inode(struct send_ctx *sctx,
+ struct fs_path *current_path,
+ struct fs_path *new_path)
+{
+ int ret;
+
+ ret = send_rename(sctx, current_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ ret = fs_path_copy(&sctx->cur_inode_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ return fs_path_copy(current_path, new_path);
+}
+
/*
* This does all the move/link/unlink/rmdir magic.
*/
@@ -4172,9 +4217,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- int did_overwrite = 0;
- int is_orphan = 0;
u64 last_dir_ino_rm = 0;
+ bool did_overwrite = false;
+ bool is_orphan = false;
bool can_rename = true;
bool orphanized_dir = false;
bool orphanized_ancestor = false;
@@ -4216,14 +4261,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (ret)
- did_overwrite = 1;
+ did_overwrite = true;
}
if (sctx->cur_inode_new || did_overwrite) {
ret = gen_unique_name(sctx, sctx->cur_ino,
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
} else {
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
valid_path);
@@ -4348,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret > 0) {
orphanized_ancestor = true;
fs_path_reset(valid_path);
+ fs_path_reset(&sctx->cur_inode_path);
ret = get_cur_path(sctx, sctx->cur_ino,
sctx->cur_inode_gen,
valid_path);
@@ -4443,13 +4489,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* it depending on the inode mode.
*/
if (is_orphan && can_rename) {
- ret = send_rename(sctx, valid_path, cur->full_path);
- if (ret < 0)
- goto out;
- is_orphan = 0;
- ret = fs_path_copy(valid_path, cur->full_path);
+ ret = rename_current_inode(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
+ is_orphan = false;
} else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
@@ -4457,10 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
+ ret = rename_current_inode(sctx, valid_path,
cur->full_path);
if (ret < 0)
goto out;
@@ -4507,7 +4547,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
@@ -4553,6 +4593,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
+ if (is_current_inode_path(sctx, cur->full_path))
+ fs_path_reset(&sctx->cur_inode_path);
}
ret = dup_ref(cur, &check_dirs);
if (ret < 0)
@@ -4701,7 +4743,7 @@ out:
static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4710,7 +4752,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4724,13 +4766,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
&sctx->new_refs, name, dir, dir_gen,
sctx);
}
-out:
+
return ret;
}
static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4739,7 +4781,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4753,7 +4795,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
&sctx->deleted_refs, name, dir,
dir_gen, sctx);
}
-out:
+
return ret;
}
@@ -4764,11 +4806,9 @@ static int record_new_ref(struct send_ctx *sctx)
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_deleted_ref(struct send_ctx *sctx)
@@ -4779,29 +4819,25 @@ static int record_deleted_ref(struct send_ctx *sctx)
sctx->cmp_key, 0, record_deleted_ref_if_needed,
sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_changed_ref(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
+ return ret;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
/*
@@ -4869,15 +4905,19 @@ out:
}
static int send_set_xattr(struct send_ctx *sctx,
- struct fs_path *path,
const char *name, int name_len,
const char *data, int data_len)
{
- int ret = 0;
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4886,7 +4926,6 @@ static int send_set_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4894,11 +4933,11 @@ static int send_remove_xattr(struct send_ctx *sctx,
struct fs_path *path,
const char *name, int name_len)
{
- int ret = 0;
+ int ret;
ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4906,7 +4945,6 @@ static int send_remove_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4914,19 +4952,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len, const char *data,
int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
/* Capabilities are emitted by finish_inode_if_needed */
if (!strncmp(name, XATTR_NAME_CAPS, name_len))
return 0;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
/*
* This hack is needed because empty acls are stored as zero byte
* data in xattrs. The problem with that is that receiving these zero byte
@@ -4943,48 +4975,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
}
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_set_xattr(sctx, name, name_len, data, data_len);
}
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_remove_xattr(sctx, p, name, name_len);
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
-out:
- fs_path_free(p);
- return ret;
+ return send_remove_xattr(sctx, p, name, name_len);
}
static int process_new_xattr(struct send_ctx *sctx)
{
- int ret = 0;
-
- ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- __process_new_xattr, sctx);
-
- return ret;
+ return iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_new_xattr, sctx);
}
static int process_deleted_xattr(struct send_ctx *sctx)
@@ -5100,17 +5111,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
static int process_changed_xattr(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
__process_changed_new_xattr, sctx);
if (ret < 0)
- goto out;
- ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
- __process_changed_deleted_xattr, sctx);
+ return ret;
-out:
- return ret;
+ return iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_changed_deleted_xattr, sctx);
}
static int process_all_new_xattrs(struct send_ctx *sctx)
@@ -5157,7 +5166,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
@@ -5172,21 +5181,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *p;
inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0);
if (ret < 0)
goto iput;
@@ -5203,27 +5211,19 @@ static int process_verity(struct send_ctx *sctx)
}
}
- ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret);
if (ret < 0)
goto iput;
- p = fs_path_alloc();
- if (!p) {
- ret = -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
goto iput;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto free_path;
ret = send_verity(sctx, p, sctx->verity_descriptor);
- if (ret < 0)
- goto free_path;
-
-free_path:
- fs_path_free(p);
iput:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5343,31 +5343,25 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
- if (ret < 0)
- goto out;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
ret = put_file_data(sctx, offset, len);
if (ret < 0)
- goto out;
+ return ret;
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5380,6 +5374,7 @@ static int send_clone(struct send_ctx *sctx,
{
int ret = 0;
struct fs_path *p;
+ struct fs_path *cur_inode_path;
u64 gen;
btrfs_debug(sctx->send_root->fs_info,
@@ -5387,6 +5382,10 @@ static int send_clone(struct send_ctx *sctx,
offset, len, btrfs_root_id(clone_root->root),
clone_root->ino, clone_root->offset);
+ cur_inode_path = get_cur_inode_path(sctx);
+ if (IS_ERR(cur_inode_path))
+ return PTR_ERR(cur_inode_path);
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5395,13 +5394,9 @@ static int send_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);
if (clone_root->root == sctx->send_root) {
ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
@@ -5452,17 +5447,13 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
@@ -5471,8 +5462,6 @@ static int send_update_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5501,12 +5490,10 @@ static int send_hole(struct send_ctx *sctx, u64 end)
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto tlv_put_failure;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
while (offset < end) {
u64 len = min(end - offset, read_size);
@@ -5527,7 +5514,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
}
sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
- fs_path_free(p);
return ret;
}
@@ -5535,9 +5521,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
struct btrfs_path *path, u64 offset,
u64 len)
{
- struct btrfs_root *root = sctx->send_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5546,23 +5530,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
- goto out;
- }
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -5578,12 +5552,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, ei));
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
ret = put_data_header(sctx, inline_size);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
btrfs_file_extent_inline_start(ei), inline_size);
sctx->send_size += inline_size;
@@ -5591,9 +5565,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(fspath);
- iput(inode);
return ret;
}
@@ -5602,7 +5573,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5617,9 +5588,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (IS_ERR(inode))
return PTR_ERR(inode);
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath)) {
+ ret = PTR_ERR(fspath);
goto out;
}
@@ -5627,10 +5598,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
@@ -5672,7 +5639,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
+ ret = btrfs_encoded_read_regular_fill_pages(inode,
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT),
@@ -5698,8 +5665,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
tlv_put_failure:
out:
- fs_path_free(fspath);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5741,15 +5707,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
}
if (sctx->cur_inode == NULL) {
+ struct btrfs_inode *btrfs_inode;
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(sctx->cur_inode)) {
- int err = PTR_ERR(sctx->cur_inode);
+ btrfs_inode = btrfs_iget(sctx->cur_ino, root);
+ if (IS_ERR(btrfs_inode))
+ return PTR_ERR(btrfs_inode);
- sctx->cur_inode = NULL;
- return err;
- }
+ sctx->cur_inode = &btrfs_inode->vfs_inode;
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
@@ -5828,7 +5793,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct fs_path *fspath = NULL;
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
@@ -5854,25 +5818,19 @@ static int send_capabilities(struct send_ctx *sctx)
leaf = path->nodes[0];
buf_len = btrfs_dir_data_len(leaf, di);
- fspath = fs_path_alloc();
buf = kmalloc(buf_len, GFP_KERNEL);
- if (!fspath || !buf) {
+ if (!buf) {
ret = -ENOMEM;
goto out;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
read_extent_buffer(leaf, buf, data_ptr, buf_len);
- ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
kfree(buf);
- fs_path_free(fspath);
btrfs_free_path(path);
return ret;
}
@@ -6898,6 +6856,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
+ fs_path_reset(&sctx->cur_inode_path);
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -8107,10 +8066,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
@@ -8173,6 +8131,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
goto out;
}
+ init_path(&sctx->cur_inode_path);
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
@@ -8449,6 +8408,9 @@ out:
btrfs_lru_cache_clear(&sctx->dir_created_cache);
btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
+ if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
+ kfree(sctx->cur_inode_path.buf);
+
kfree(sctx);
}
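/*
 * Hedged note (struct fs_path itself is not shown in this diff): an fs_path
 * starts with ->buf pointing at a small inline buffer and only switches to
 * a heap allocation once the path outgrows it, which is why the teardown
 * above frees ->buf only when it no longer equals ->inline_buf.
 */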
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9309886c5ea1..652bb28f63d4 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct btrfs_inode;
+struct btrfs_root;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg);
#endif
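/*
 * Hedged sketch of a caller (assumption; the ioctl entry point is not part
 * of this diff): with the new prototype the caller derives the root from
 * the file's inode and passes it straight through.
 */
static long example_send_ioctl(struct file *file,
			       const struct btrfs_ioctl_send_args *arg)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(file));

	return btrfs_ioctl_send(inode->root, arg);
}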
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index a341d087567a..ff089e3e4103 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/spinlock.h"
+#include <linux/spinlock.h>
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 722acf768396..11dbd7be6a3b 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -2,12 +2,11 @@
#include <linux/slab.h>
#include "messages.h"
-#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
/*
- * Subpage (sectorsize < PAGE_SIZE) support overview:
+ * Subpage (block size < folio size) support overview:
*
* Limitations:
*
@@ -64,35 +63,14 @@
* This means a slightly higher tree locking latency.
*/
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
-{
- if (fs_info->sectorsize >= PAGE_SIZE)
- return false;
-
- /*
- * Only data pages (either through DIO or compression) can have no
- * mapping. And if page->mapping->host is data inode, it's subpage.
- * As we have ruled our sectorsize >= PAGE_SIZE case already.
- */
- if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
- return true;
-
- /*
- * Now the only remaining case is metadata, which we only go subpage
- * routine if nodesize < PAGE_SIZE.
- */
- if (fs_info->nodesize < PAGE_SIZE)
- return true;
- return false;
-}
-#endif
-
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
+	/* For metadata we don't support large folios yet. */
+ ASSERT(!folio_test_large(folio));
+
/*
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
@@ -101,10 +79,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
+ if (folio_test_private(folio))
+ return 0;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return 0;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, type);
+ subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
@@ -112,12 +94,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
+ if (!folio_test_private(folio))
+ return;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
subpage = folio_detach_private(folio);
@@ -126,15 +113,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol
}
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type)
+ size_t fsize, enum btrfs_subpage_type type)
{
struct btrfs_subpage *ret;
unsigned int real_size;
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->sectorsize < fsize);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max *
+ (fsize >> fs_info->sectorsize_bits)));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
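/*
 * Illustrative sizing (assumed numbers, not from this patch): with a 4K
 * block size and a 16K folio, fsize >> sectorsize_bits == 4, so each of
 * the btrfs_bitmap_nr_max state bitmaps gets four bits and the whole
 * structure still fits in a single struct_size() allocation.
 */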
@@ -165,7 +153,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
@@ -179,7 +167,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
@@ -206,16 +194,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
*/
if (folio->mapping)
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_pos(folio) + folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int __start_bit; \
+ unsigned int __start_bit; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
__start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
+ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \
__start_bit; \
})
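/*
 * Worked example (illustrative numbers): with 4K blocks in a 16K folio,
 * blocks_per_folio == 4, so the shared bitmap is laid out in 4-bit
 * sections, one per state.  A range starting 8K into the folio maps to
 * block 2, and the "dirty" section starts at btrfs_bitmap_nr_dirty * 4,
 * giving a start bit of btrfs_bitmap_nr_dirty * 4 + 2.
 */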
@@ -233,7 +223,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + folio_size(folio),
orig_start + orig_len) - *start;
}
@@ -296,7 +286,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -323,13 +313,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
bool last = false;
int cleared = 0;
int bit;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -341,7 +332,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
}
spin_lock_irqsave(&subpage->lock, flags);
- for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bitmap, blocks_per_folio) {
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++;
}
@@ -352,15 +343,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
folio_unlock(folio);
}
-#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+#define subpage_test_bitmap_all_set(fs_info, folio, name) \
+({ \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
-#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
+({ \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
@@ -372,7 +375,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -426,7 +429,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -467,7 +470,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
@@ -498,7 +501,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -513,7 +516,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -569,7 +572,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -579,7 +582,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -589,7 +592,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
@@ -597,7 +600,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -608,7 +611,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -619,10 +622,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
btrfs_subpage_clamp_range(folio, &start, &len); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+} \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_set_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_clear_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
folio_test_uptodate);
@@ -635,26 +660,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
folio_test_checked);
-#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
+#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
{ \
- const int sectors_per_page = fs_info->sectors_per_page; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_subpage *subpage = folio_get_private(folio); \
\
- ASSERT(sectors_per_page < BITS_PER_LONG); \
+ ASSERT(blocks_per_folio < BITS_PER_LONG); \
*dst = bitmap_read(subpage->bitmaps, \
- sectors_per_page * btrfs_bitmap_nr_##name, \
- sectors_per_page); \
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
}
#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
{ \
- const struct btrfs_subpage *subpage = folio_get_private(folio); \
unsigned long bitmap; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
\
- GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \
+ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
btrfs_warn(fs_info, \
"dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
start, len, folio_pos(folio), \
- fs_info->sectors_per_page, &bitmap); \
+ blocks_per_folio, &bitmap); \
}
/*
@@ -672,7 +700,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
ASSERT(!folio_test_dirty(folio));
return;
}
@@ -707,7 +735,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
int ret;
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
subpage = folio_get_private(folio);
@@ -721,15 +749,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
}
bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->nr_locked);
- ASSERT(ret <= fs_info->sectors_per_page);
+ ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
spin_unlock_irqrestore(&subpage->lock, flags);
}
+/*
+ * Clear the dirty flag for the folio.
+ *
+ * If the affected folio is no longer dirty, return true. Otherwise return false.
+ */
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb)
+{
+ bool last;
+
+ if (!btrfs_meta_is_subpage(eb->fs_info)) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+
+ last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len);
+ if (last) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+ return false;
+}
+
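/*
 * Hedged usage sketch (the caller is not part of this diff and locking is
 * omitted): metadata writeback would call the helper once per folio of an
 * extent buffer and treat a folio as clean only when none of its blocks
 * remain dirty.
 */
static bool example_clear_eb_dirty(const struct extent_buffer *eb)
{
	bool all_clean = true;

	for (int i = 0; i < num_extent_folios(eb); i++) {
		struct folio *folio = eb->folios[i];

		/* True only once every block of this folio is clean. */
		all_clean &= btrfs_meta_folio_clear_and_test_dirty(folio, eb);
	}
	return all_clean;
}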
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- const u32 sectors_per_page = fs_info->sectors_per_page;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
@@ -739,28 +789,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(sectors_per_page > 1);
+ ASSERT(blocks_per_folio > 1);
subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
- sectors_per_page, &uptodate_bitmap,
- sectors_per_page, &dirty_bitmap,
- sectors_per_page, &locked_bitmap,
- sectors_per_page, &writeback_bitmap,
- sectors_per_page, &ordered_bitmap,
- sectors_per_page, &checked_bitmap);
+ blocks_per_folio, &uptodate_bitmap,
+ blocks_per_folio, &dirty_bitmap,
+ blocks_per_folio, &locked_bitmap,
+ blocks_per_folio, &writeback_bitmap,
+ blocks_per_folio, &ordered_bitmap,
+ blocks_per_folio, &checked_bitmap);
}
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
@@ -771,10 +821,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(fs_info->sectors_per_page > 1);
+ ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 44fff1f4eac4..3042c5ea840a 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,11 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include "btrfs_inode.h"
+#include "fs.h"
struct address_space;
struct folio;
-struct btrfs_fs_info;
/*
 * Extra info for subpage bitmap.
@@ -69,23 +70,49 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
+#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE
+/*
+ * Subpage support for metadata is more complex, as we can have dummy extent
+ * buffers, where folios have no mapping to determine the owning inode.
+ *
+ * Thankfully we only need to check if node size is smaller than page size.
+ * Even with larger folio support, we will only allocate a folio as large as
+ * node size.
+ * Thus if nodesize < PAGE_SIZE, we know metadata needs the subpage routine.
+ */
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->nodesize < PAGE_SIZE;
+}
+static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
+ struct folio *folio)
+{
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
+ return fs_info->sectorsize < folio_size(folio);
+}
#else
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct address_space *mapping)
+ struct folio *folio)
{
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
return false;
}
#endif
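/*
 * Illustrative example (assumed configuration): on a 16K-page system with a
 * 4K sector size and a 16K node size, btrfs_is_subpage() is true for a
 * page-sized data folio while btrfs_meta_is_subpage() is false, so data I/O
 * takes the per-block bitmap path and metadata keeps using plain folio
 * flags.
 */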
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_subpage_type type);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type);
+ size_t fsize, enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
@@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
* btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
+ *
+ * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios.
+ *
+ * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there
+ * is no clamp version for metadata helpers, as we either go subpage
+ * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE,
+ * and our folio is never larger than nodesize).
*/
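/*
 * Hedged usage note: a metadata caller passes only the folio and the eb,
 * since the eb already carries fs_info and the byte range, e.g.
 *	btrfs_meta_folio_set_uptodate(folio, eb);
 * while data callers keep the explicit (fs_info, folio, start, len) form.
 */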
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len); \
bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct folio *folio, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dc4fee519ca6..40709e2a44fc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -84,7 +84,7 @@ struct btrfs_fs_context {
u32 thread_pool_size;
unsigned long long mount_opt;
unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_level;
refcount_t refs;
};
@@ -947,7 +947,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
int err;
@@ -982,7 +982,7 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}
- sb->s_root = d_make_root(inode);
+ sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 14f53f757555..b9af74498b0c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
+ ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
@@ -1342,17 +1343,18 @@ int btrfs_read_policy_to_enum(const char *str, s64 *value_ret)
/* Separate value from input in policy:value format. */
value_str = strchr(param, ':');
if (value_str) {
- int ret;
+ char *retptr;
*value_str = 0;
value_str++;
if (!value_ret)
return -EINVAL;
- ret = kstrtos64(value_str, 10, value_ret);
- if (ret)
+
+ *value_ret = memparse(value_str, &retptr);
+	/* There could be trailing garbage after the value. */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || *value_ret <= 0)
return -EINVAL;
- if (*value_ret < 0)
- return -ERANGE;
}
#endif
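/*
 * Illustrative example (the policy name is an assumption): with memparse()
 * the value part of a string such as "round-robin:128k" parses to 131072,
 * while a zero, negative or trailing-garbage value still fails with
 * -EINVAL.
 */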
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3fc5c6f90dc4..0f94ae923210 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -7,6 +7,7 @@
#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct block_device;
struct btrfs_fs_info;
struct btrfs_device;
struct btrfs_fs_devices;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 0a2dbfaaf49e..74aca7180a5a 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -525,7 +525,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, 0);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -542,7 +542,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
* Test again for case where the tree block is sectorsize aligned but
* not nodesize aligned.
*/
- eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, sectorsize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -730,7 +730,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, SZ_1M);
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 56e61ac1cc64..609bb6c9c087 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
test_err("error adding chunk map to mapping tree");
+ btrfs_free_chunk_map(map);
goto out_free;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index aca83a98b75a..f26a394a9ec5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the deleted_bgs list during a transaction abort.
+ */
+ spin_lock(&transaction->fs_info->unused_bgs_lock);
list_del_init(&cache->bg_list);
+ spin_unlock(&transaction->fs_info->unused_bgs_lock);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
@@ -1635,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = &pending->dir->vfs_inode;
+ struct btrfs_inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1661,7 +1667,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* filesystem.
*/
nofs_flags = memalloc_nofs_save();
- pending->error = fscrypt_setup_filename(parent_inode,
+ pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
@@ -1690,8 +1696,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.objectid = objectid;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
@@ -1699,16 +1705,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- parent_root = BTRFS_I(parent_inode)->root;
+ parent_root = parent_inode->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
- cur_time = current_time(parent_inode);
+ cur_time = current_time(&parent_inode->vfs_inode);
/*
* insert the directory item
*/
- ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
+ ret = btrfs_set_inode_index(parent_inode, &index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1716,7 +1722,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
- btrfs_ino(BTRFS_I(parent_inode)),
+ btrfs_ino(parent_inode),
&fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
@@ -1817,7 +1823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_add_root_ref(trans, objectid,
btrfs_root_id(parent_root),
- btrfs_ino(BTRFS_I(parent_inode)), index,
+ btrfs_ino(parent_inode), index,
&fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1855,18 +1861,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
- BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ parent_inode, &key, BTRFS_FT_DIR,
index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
- btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
+ btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
fname.disk_name.len * 2);
- inode_set_mtime_to_ts(parent_inode,
- inode_set_ctime_current(parent_inode));
- ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, parent_inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2096,7 +2102,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the new_bgs list during a transaction abort.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
}
}
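/*
 * Hedged note on the added put: a block group presumably holds an extra
 * reference while it sits on the transaction's new_bgs list, so dropping it
 * here mirrors what the normal, non-abort path does when that list is
 * drained.
 */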
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 955d1677e865..90dc094cfa5e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* We're holding a transaction handle whether we are logging or
@@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log,
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given eb, slot and key all refer to
+ * the source data we are copying out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The given root is for the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and will be
+ * released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/*
@@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
@@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
+ if (!src_copy) {
btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
return -ENOMEM;
}
read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
-
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
u64 nbytes;
u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
item = btrfs_item_ptr(eb, slot,
struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, nbytes);
@@ -514,11 +511,13 @@ insert:
key, item_size);
path->skip_release_on_error = 0;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
- u32 found_size;
- found_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
@@ -526,8 +525,7 @@ insert:
} else if (ret) {
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -546,7 +544,6 @@ insert:
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0) {
- struct extent_buffer *dst_eb = path->nodes[0];
const u64 ino_size = btrfs_inode_size(eb, src_item);
/*
@@ -564,30 +561,28 @@ insert:
}
if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
if (key->type == BTRFS_INODE_ITEM_KEY) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
btrfs_release_path(path);
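/*
 * Hedged note on the rework above: memcmp_extent_buffer() compares a plain
 * memory buffer against bytes still inside an extent buffer, so only the
 * source item needs a temporary copy (src_copy); the destination item is
 * read in place via dst_ptr, saving the second allocation and copy.
 */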
@@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
+static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
- inode = NULL;
+ return NULL;
return inode;
}
@@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long size;
int ret = 0;
@@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
btrfs_release_path(path);
goto out;
}
@@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
@@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
(unsigned long)item, sizeof(*item));
ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
offset = key->offset - btrfs_file_extent_offset(eb, item);
/*
@@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
@@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1148,7 +1134,7 @@ again:
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
- struct inode *victim_parent;
+ struct btrfs_inode *victim_parent;
leaf = path->nodes[0];
@@ -1188,10 +1174,10 @@ again:
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
+ victim_parent,
inode, &victim_name);
}
- iput(victim_parent);
+ iput(&victim_parent->vfs_inode);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1325,7 +1311,7 @@ again:
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
btrfs_release_path(path);
dir = read_one_inode(root, parent_id);
@@ -1334,10 +1320,9 @@ again:
kfree(name.name);
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
@@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
+ ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
@@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
if (ret)
goto out;
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
@@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
kfree(name.name);
name.name = NULL;
if (log_ref_ver) {
- iput(dir);
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
if (ret)
goto out;
@@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1611,25 +1595,25 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = count_inode_refs(BTRFS_I(inode), path);
+ ret = count_inode_refs(inode, path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(BTRFS_I(inode), path);
+ ret = count_inode_extrefs(inode, path);
if (ret < 0)
goto out;
@@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
ino, 1);
if (ret)
@@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
@@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
}
ret = fixup_inode_link_count(trans, inode);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
@@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
inode = read_one_inode(root, objectid);
if (!inode)
return -EIO;
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
@@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
} else if (ret == -EEXIST) {
ret = 0;
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
inode = read_one_inode(root, location->objectid);
@@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
dir = read_one_inode(root, dirid);
if (!dir) {
- iput(inode);
+ iput(&inode->vfs_inode);
return -EIO;
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
@@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
@@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2184,7 +2167,8 @@ out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
- struct inode *dir;
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
@@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
- iput(dir);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
inode = read_one_inode(root, key.objectid);
@@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
ret = -EIO;
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
+ ret = btrfs_drop_extents(wc->trans, root, inode,
&drop_args);
if (!ret) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- BTRFS_I(inode));
+ ret = btrfs_update_inode(wc->trans, inode);
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
@@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
- key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = first_offset;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
/*
* -EEXIST is fine and can happen sporadically when we are logging a
@@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5497,7 +5478,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5524,17 +5505,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5576,14 +5556,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6789,7 +6767,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
- struct inode *dir_inode;
+ struct btrfs_inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
@@ -6838,18 +6816,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -6874,7 +6850,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6889,11 +6865,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -7061,26 +7036,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
- bool log_dentries = false;
+ bool log_dentries;
- if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* If we're logging an inode from a subvolume created in the current
* transaction we must force a commit since the root is not persisted.
*/
- if (btrfs_root_generation(&root->root_item) == trans->transid) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
@@ -7089,14 +7058,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
*/
if ((btrfs_inode_in_log(inode, trans->transid) &&
list_empty(&ctx->ordered_extents)) ||
- inode->vfs_inode.i_nlink == 0) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ inode->vfs_inode.i_nlink == 0)
+ return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_no_trans;
+ return ret;
ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
@@ -7115,8 +7082,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
- log_dentries = true;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
/*
* On unlink we must make sure all our current and old parent directory
@@ -7171,8 +7141,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (log_dentries)
ret = log_new_dir_dentries(trans, inode, ctx);
- else
- ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
@@ -7182,7 +7150,7 @@ end_trans:
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -7247,8 +7215,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
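The tree-log.c hunks above convert local variables and helper arguments from struct inode * to struct btrfs_inode *, reaching the generic VFS object through &inode->vfs_inode only where the VFS API requires it. Below is a minimal user-space sketch of that embedded-inode pattern, with a hand-rolled container_of() and illustrative type names rather than the real btrfs definitions:

#include <stdio.h>
#include <stddef.h>

/* Stand-in for the generic VFS inode. */
struct inode {
	unsigned long i_ino;
	long long i_size;
};

/* Filesystem-private inode embedding the generic one (illustrative only). */
struct fs_inode {
	unsigned long long generation;
	struct inode vfs_inode;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Up-cast: generic inode -> filesystem inode, the role BTRFS_I() plays. */
static struct fs_inode *FS_I(struct inode *inode)
{
	return container_of(inode, struct fs_inode, vfs_inode);
}

int main(void)
{
	struct fs_inode fi = {
		.generation = 7,
		.vfs_inode = { .i_ino = 42, .i_size = 4096 },
	};
	struct inode *vfs = &fi.vfs_inode;	/* down-cast: &inode->vfs_inode */

	printf("ino=%lu gen=%llu\n", vfs->i_ino, FS_I(vfs)->generation);
	return 0;
}
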
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index e97ad824ae16..b7a96a005487 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode)
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
goto out;
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3f8afbd1ebb5..c8c21c55be53 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1798,8 +1798,8 @@ again:
path->skip_locking = 1;
key.objectid = device->devid;
- key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = search_start;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
@@ -1918,8 +1918,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = device->devid;
- key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -2721,8 +2721,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = 0;
while (1) {
btrfs_reserve_chunk_metadata(trans, false);
@@ -3119,8 +3119,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -3577,8 +3577,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -4184,8 +4184,8 @@ again:
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
@@ -5001,8 +5001,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
again:
key.objectid = device->devid;
- key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = (u64)-1;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -7539,8 +7539,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *node = path->nodes[1];
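Every volumes.c hunk above is the same mechanical change: assign the search-key fields in their declaration order (objectid, then type, then offset). A small sketch of that convention, using an illustrative stand-in struct rather than the real btrfs_key:

#include <stdio.h>

/* Illustrative stand-in for a three-part search key (not the on-disk layout). */
struct search_key {
	unsigned long long objectid;
	unsigned char type;
	unsigned long long offset;
};

int main(void)
{
	/* Assign the fields in declaration order, as the reordered hunks do ... */
	struct search_key key;

	key.objectid = 1;
	key.type = 0xcc;
	key.offset = (unsigned long long)-1;

	/* ... or use a designated initializer, which reads the same way. */
	struct search_key key2 = { .objectid = 1, .type = 0xcc, .offset = 0 };

	printf("%llu %u %llu / %llu\n", key.objectid, (unsigned)key.type,
	       key.offset, key2.objectid);
	return 0;
}
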
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 120f65e21eeb..e247d551da67 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -7,6 +7,7 @@
#define BTRFS_VOLUMES_H
#include <linux/blk_types.h>
+#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/atomic.h>
#include <linux/sort.h>
@@ -18,14 +19,17 @@
#include <linux/completion.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
#include "rcu-string.h"
+#include "extent-io-tree.h"
struct block_device;
struct bdev_handle;
struct btrfs_fs_info;
struct btrfs_block_group;
struct btrfs_trans_handle;
+struct btrfs_transaction;
struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 8dc4cf49f6f0..0ce10e4ec836 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
+#include <linux/types.h>
+
struct dentry;
struct inode;
struct qstr;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c9e92c6941ec..545f413d81fc 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -94,6 +94,47 @@ fail:
return ERR_PTR(-ENOMEM);
}
+/*
+ * Helper for S390x with hardware zlib compression support.
+ *
+ * That hardware acceleration requires a buffer size larger than a single page
+ * to get ideal performance, thus we need to do the memory copy rather than
+ * use the page cache directly as input buffer.
+ */
+static int copy_data_into_buffer(struct address_space *mapping,
+ struct workspace *workspace, u64 filepos,
+ unsigned long length)
+{
+ u64 cur = filepos;
+
+ /* It's only for hardware accelerated zlib code. */
+ ASSERT(zlib_deflate_dfltcc_enabled());
+
+ while (cur < filepos + length) {
+ struct folio *folio;
+ void *data_in;
+ unsigned int offset;
+ unsigned long copy_length;
+ int ret;
+
+ ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
+ if (ret < 0)
+ return ret;
+ /* No large folio support yet. */
+ ASSERT(!folio_test_large(folio));
+
+ offset = offset_in_folio(folio, cur);
+ copy_length = min(folio_size(folio) - offset,
+ filepos + length - cur);
+
+ data_in = kmap_local_folio(folio, offset);
+ memcpy(workspace->buf + cur - filepos, data_in, copy_length);
+ kunmap_local(data_in);
+ cur += copy_length;
+ }
+ return 0;
+}
+
int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
@@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
int nr_folios = 0;
struct folio *in_folio = NULL;
struct folio *out_folio = NULL;
- unsigned long bytes_left;
- unsigned int in_buf_folios;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
@@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
* the workspace buffer if required.
*/
if (workspace->strm.avail_in == 0) {
- bytes_left = len - workspace->strm.total_in;
- in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_folios > 1) {
- int i;
-
- /* S390 hardware acceleration path, not subpage. */
- ASSERT(!btrfs_is_subpage(
- inode_to_fs_info(mapping->host),
- mapping));
- for (i = 0; i < in_buf_folios; i++) {
- if (data_in) {
- kunmap_local(data_in);
- folio_put(in_folio);
- data_in = NULL;
- }
- ret = btrfs_compress_filemap_get_folio(mapping,
- start, &in_folio);
- if (ret < 0)
- goto out;
- data_in = kmap_local_folio(in_folio, 0);
- copy_page(workspace->buf + i * PAGE_SIZE,
- data_in);
- start += PAGE_SIZE;
- }
+ unsigned long bytes_left = len - workspace->strm.total_in;
+ unsigned int copy_length = min(bytes_left, workspace->buf_size);
+
+ /*
+ * This can only happen when hardware zlib compression is
+ * enabled.
+ */
+ if (copy_length > PAGE_SIZE) {
+ ret = copy_data_into_buffer(mapping, workspace,
+ start, copy_length);
+ if (ret < 0)
+ goto out;
+ start += copy_length;
workspace->strm.next_in = workspace->buf;
- workspace->strm.avail_in = min(bytes_left,
- in_buf_folios << PAGE_SHIFT);
+ workspace->strm.avail_in = copy_length;
} else {
unsigned int pg_off;
unsigned int cur_len;
@@ -463,6 +489,7 @@ out:
const struct btrfs_compress_op btrfs_zlib_compress = {
.workspace_manager = &wsm,
+ .min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
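The new copy_data_into_buffer() exists because the s390 DFLTCC path wants a contiguous input buffer larger than one page, so folio contents are gathered into the workspace instead of being mapped directly. Here is a user-space sketch of the same gathering loop over fixed-size chunks, assuming 4 KiB "pages" and hypothetical names:

#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE 4096UL	/* stand-in for PAGE_SIZE */

/* Copy [filepos, filepos + length) from an array of fixed-size chunks into
 * one contiguous buffer, one chunk at a time, like the dfltcc helper does. */
static void gather_into_buffer(char chunks[][CHUNK_SIZE], char *buf,
			       unsigned long filepos, unsigned long length)
{
	unsigned long cur = filepos;

	while (cur < filepos + length) {
		unsigned long idx = cur / CHUNK_SIZE;
		unsigned long offset = cur % CHUNK_SIZE;
		unsigned long copy_length = CHUNK_SIZE - offset;

		if (copy_length > filepos + length - cur)
			copy_length = filepos + length - cur;

		memcpy(buf + (cur - filepos), &chunks[idx][offset], copy_length);
		cur += copy_length;
	}
}

int main(void)
{
	static char chunks[4][CHUNK_SIZE];
	static char buf[2 * CHUNK_SIZE];

	memset(chunks[1], 'a', CHUNK_SIZE);
	memset(chunks[2], 'b', CHUNK_SIZE);

	/* Gather one page worth of data starting 100 bytes into chunk 1. */
	gather_into_buffer(chunks, buf, CHUNK_SIZE + 100, CHUNK_SIZE);
	printf("%c %c\n", buf[0], buf[CHUNK_SIZE - 100]);	/* a b */
	return 0;
}
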
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 73e0aa9fc08a..fb8b8b29c169 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2111,6 +2111,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2272,6 +2275,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_zoned_device_info *zinfo = device->zone_info;
unsigned int nofs_flags;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2325,6 +2331,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!btrfs_is_zoned(fs_info))
return true;
+ if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
+ return false;
+
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&fs_info->zone_active_bgs_lock);
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5232b56d5892..cd5f38d6fbaa 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -26,11 +26,12 @@
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MIN_LEVEL -15
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(int level,
size_t src_len)
{
zstd_parameters params = zstd_get_params(level, src_len);
@@ -45,13 +46,14 @@ struct workspace {
void *mem;
size_t size;
char *buf;
- unsigned int level;
- unsigned int req_level;
+ int level;
+ int req_level;
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
zstd_in_buffer in_buf;
zstd_out_buffer out_buf;
+ zstd_parameters params;
};
/*
@@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+static inline int clip_level(int level)
+{
+ return max(0, level - 1);
+}
/*
* Timer callback to free unused workspaces.
@@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_for_each_prev_safe(pos, next, &wsm.lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
- unsigned int level;
+ int level;
if (time_after(victim->last_used, reclaim_threshold))
break;
@@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level - 1]))
- clear_bit(level - 1, &wsm.active_map);
+ if (list_empty(&wsm.idle_ws[level]))
+ clear_bit(level, &wsm.active_map);
}
@@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
static void zstd_calc_ws_mem_sizes(void)
{
size_t max_size = 0;
- unsigned int level;
+ int level;
- for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ if (level == 0)
+ continue;
zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
@@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void)
zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
- zstd_ws_mem_sizes[level - 1] = max_size;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ zstd_ws_mem_sizes[clip_level(level)] = max_size;
}
}
@@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(unsigned int level)
+static struct list_head *zstd_find_workspace(int level)
{
struct list_head *ws;
struct workspace *workspace;
- int i = level - 1;
+ int i = clip_level(level);
spin_lock_bh(&wsm.lock);
for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
@@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
- if (level == workspace->level)
+ if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
if (list_empty(&wsm.idle_ws[i]))
clear_bit(i, &wsm.active_map);
@@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws)
spin_lock_bh(&wsm.lock);
/* A node is only taken off the lru if we are the corresponding level */
- if (workspace->req_level == workspace->level) {
+ if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
@@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws)
}
}
- set_bit(workspace->level - 1, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ set_bit(workspace->level, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
workspace->req_level = 0;
spin_unlock_bh(&wsm.lock);
- if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
cond_wake_up(&wsm.wait);
}
@@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(int level)
{
struct workspace *workspace;
@@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level)
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = zstd_ws_mem_sizes[level - 1];
- workspace->level = level;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ workspace->size = zstd_ws_mem_sizes[clip_level(level)];
+ workspace->level = clip_level(level);
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
@@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- unsigned int pg_off;
unsigned int cur_len;
- zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
- len);
+ workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
*out_folios = 0;
*total_out = 0;
*total_in = 0;
/* Initialize the stream */
- stream = zstd_init_cstream(&params, len, workspace->mem,
+ stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
struct btrfs_inode *inode = BTRFS_I(mapping->host);
@@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
@@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ workspace->in_buf.src = kmap_local_folio(in_folio,
+ offset_in_page(start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
}
@@ -717,6 +722,7 @@ finish:
const struct btrfs_compress_op btrfs_zstd_compress = {
/* ZSTD uses own workspace manager */
.workspace_manager = NULL,
+ .min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};
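With negative (fast) zstd levels now allowed, clip_level() folds every level at or below 1 onto workspace slot 0, while positive level N keeps using slot N - 1. A tiny sketch of that mapping, reusing the min/max bounds visible in the patch:

#include <stdio.h>

#define MIN_LEVEL (-15)		/* ZSTD_BTRFS_MIN_LEVEL in the patch */
#define MAX_LEVEL 15		/* ZSTD_BTRFS_MAX_LEVEL */

/* All fast (negative) levels share the level-1 workspace slot. */
static int clip_level(int level)
{
	return level - 1 > 0 ? level - 1 : 0;
}

int main(void)
{
	for (int level = MIN_LEVEL; level <= MAX_LEVEL; level++) {
		if (level == 0)		/* level 0 is not a valid btrfs zstd level */
			continue;
		printf("level %3d -> workspace slot %d\n", level, clip_level(level));
	}
	return 0;
}
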
diff --git a/fs/buffer.c b/fs/buffer.c
index cc8452f60251..194eacbefc95 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2361,9 +2361,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head, *prev = NULL;
size_t blocksize;
- int nr, i;
int fully_mapped = 1;
bool page_error = false;
loff_t limit = i_size_read(inode);
@@ -2372,16 +2371,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
limit = inode->i_sb->s_maxbytes;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
iblock = div_u64(folio_pos(folio), blocksize);
lblock = div_u64(limit + blocksize - 1, blocksize);
bh = head;
- nr = 0;
- i = 0;
do {
if (buffer_uptodate(bh))
@@ -2398,7 +2393,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
page_error = true;
}
if (!buffer_mapped(bh)) {
- folio_zero_range(folio, i * blocksize,
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
if (!err)
set_buffer_uptodate(bh);
@@ -2411,40 +2406,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (buffer_uptodate(bh))
continue;
}
- arr[nr++] = bh;
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
-
- if (fully_mapped)
- folio_set_mappedtodisk(folio);
-
- if (!nr) {
- /*
- * All buffers are uptodate or get_block() returned an
- * error when trying to map them - we can finish the read.
- */
- folio_end_read(folio, !page_error);
- return 0;
- }
- /* Stage two: lock the buffers */
- for (i = 0; i < nr; i++) {
- bh = arr[i];
lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
mark_buffer_async_read(bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ prev = bh;
+ } while (iblock++, (bh = bh->b_this_page) != head);
+
+ if (fully_mapped)
+ folio_set_mappedtodisk(folio);
/*
- * Stage 3: start the IO. Check for uptodateness
- * inside the buffer lock in case another process reading
- * the underlying blockdev brought it uptodate (the sct fix).
+ * All buffers are uptodate or get_block() returned an error
+ * when trying to map them - we must finish the read because
+ * end_buffer_async_read() will never be called on any buffer
+ * in this folio.
*/
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (buffer_uptodate(bh))
- end_buffer_async_read(bh, 1);
- else
- submit_bh(REQ_OP_READ, bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ else
+ folio_end_read(folio, !page_error);
+
return 0;
}
EXPORT_SYMBOL(block_read_full_folio);
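block_read_full_folio() no longer stages buffers in an on-stack array: it submits the previous buffer as soon as the next one is marked for async read, and keeps only the last one to submit (or the folio to finish) after the loop. A user-space sketch of that hold-the-last-item-back pattern, with illustrative names:

#include <stdio.h>

static void submit(int item)
{
	printf("submit %d\n", item);
}

int main(void)
{
	int items[] = { 3, 7, 11, 42 };
	int have_prev = 0, prev = 0;

	/* Defer each submission until the next item has been queued, so the
	 * final one is only sent after the whole batch is marked in flight. */
	for (unsigned i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		if (have_prev)
			submit(prev);
		prev = items[i];
		have_prev = 1;
	}

	if (have_prev)
		submit(prev);		/* last item, after the loop */
	else
		printf("nothing to read, finish immediately\n");

	return 0;
}
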
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7cf59713f0f7..83a60126de0f 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -128,18 +128,19 @@ retry:
ret = security_path_mkdir(&path, subdir, 0700);
if (ret < 0)
goto mkdir_error;
- ret = cachefiles_inject_write_error();
- if (ret == 0)
- ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- if (ret < 0) {
+ subdir = ERR_PTR(cachefiles_inject_write_error());
+ if (!IS_ERR(subdir))
+ subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
+ ret = PTR_ERR(subdir);
+ if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
goto mkdir_error;
}
trace_cachefiles_mkdir(dir, subdir);
- if (unlikely(d_unhashed(subdir))) {
- cachefiles_put_directory(subdir);
+ if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
+ dput(subdir);
goto retry;
}
ASSERT(d_backing_inode(subdir));
@@ -195,7 +196,8 @@ mark_error:
mkdir_error:
inode_unlock(d_inode(dir));
- dput(subdir);
+ if (!IS_ERR(subdir))
+ dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
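The namei.c change adapts to vfs_mkdir() handing back a dentry or an encoded error, so success and failure are distinguished with IS_ERR()/PTR_ERR() and dput() is only called on a real dentry. A minimal user-space sketch of the pointer-encoded error convention, with the helpers re-implemented locally (the kernel's versions live in <linux/err.h>):

#include <stdio.h>
#include <errno.h>

/* Local re-implementation of the kernel's pointer-encoded error helpers. */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

struct dentry { const char *name; };

/* Either returns a valid object or an encoded -errno, never NULL. */
static struct dentry *make_subdir(int inject_error)
{
	static struct dentry d = { .name = "subdir" };

	if (inject_error)
		return ERR_PTR(-EIO);
	return &d;
}

int main(void)
{
	for (int inject = 0; inject <= 1; inject++) {
		struct dentry *subdir = make_subdir(inject);

		if (IS_ERR(subdir))
			printf("mkdir failed with error %ld\n", PTR_ERR(subdir));
		else
			printf("created %s\n", subdir->name);
	}
	return 0;
}
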
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index fe3de9ad57bf..d9bc67176128 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_free_id;
}
- anon_file->file = anon_inode_getfile("[cachefiles]",
- &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ anon_file->file = anon_inode_getfile_fmode("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object,
+ O_WRONLY, FMODE_PWRITE | FMODE_LSEEK);
if (IS_ERR(anon_file->file)) {
ret = PTR_ERR(anon_file->file);
goto err_put_fd;
@@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_put_file;
}
- anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
-
load = (void *)req->msg.data;
load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5224a566b69..29be367905a1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
struct inode *inode = mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
@@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
return false;
}
+ atomic64_inc(&mdsc->dirty_folios);
+
ci = ceph_inode(inode);
/* dirty the head */
@@ -568,7 +571,36 @@ struct ceph_writeback_ctl
u64 truncate_size;
u32 truncate_seq;
bool size_stable;
+
bool head_snapc;
+ struct ceph_snap_context *snapc;
+ struct ceph_snap_context *last_snapc;
+
+ bool done;
+ bool should_loop;
+ bool range_whole;
+ pgoff_t start_index;
+ pgoff_t index;
+ pgoff_t end;
+ xa_mark_t tag;
+
+ pgoff_t strip_unit_end;
+ unsigned int wsize;
+ unsigned int nr_folios;
+ unsigned int max_pages;
+ unsigned int locked_pages;
+
+ int op_idx;
+ int num_ops;
+ u64 offset;
+ u64 len;
+
+ struct folio_batch fbatch;
+ unsigned int processed_in_fbatch;
+
+ bool from_pool;
+ struct page **pages;
+ struct page **data_pages;
};
/*
@@ -666,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * Write a single page, but leave the page locked.
+ * Write a folio, but leave it locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
+ * dirty page accounting (i.e., folio is no longer dirty).
*/
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+static int write_folio_nounlock(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
- loff_t page_off = page_offset(page);
+ loff_t page_off = folio_pos(folio);
int err;
- loff_t len = thp_size(page);
+ loff_t len = folio_size(folio);
loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -689,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
- page->index);
+ doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
+ folio->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
/* verify this is a writeable snap context */
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc) {
- doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
- page);
+ doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
+ folio);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), folio, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return 0;
}
ceph_put_snap_context(oldest);
@@ -726,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -740,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return PTR_ERR(req);
}
if (wlen < len)
len = wlen;
- set_page_writeback(page);
+ folio_start_writeback(folio);
if (caching)
- ceph_set_page_fscache(page);
+ ceph_set_page_fscache(&folio->page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
- bounce_page = fscrypt_encrypt_pagecache_blocks(page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
CEPH_FSCRYPT_BLOCK_SIZE, 0,
GFP_NOFS);
if (IS_ERR(bounce_page)) {
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
ceph_osdc_put_request(req);
return PTR_ERR(bounce_page);
}
}
/* it may be a short write due to an object boundary */
- WARN_ON_ONCE(len > thp_size(page));
+ WARN_ON_ONCE(len > folio_size(folio));
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
@@ -791,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
doutc(cl, "%llx.%llx interrupted page %p\n",
- ceph_vinop(inode), page);
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ ceph_vinop(inode), folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
- ceph_vinop(inode), err, page);
+ doutc(cl, "%llx.%llx setting mapping error %d %p\n",
+ ceph_vinop(inode), err, folio);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
doutc(cl, "%llx.%llx cleaned page %p\n",
- ceph_vinop(inode), page);
+ ceph_vinop(inode), folio);
err = 0; /* vfs expects us to return 0 */
}
- oldest = detach_page_private(page);
+ oldest = folio_detach_private(folio);
WARN_ON_ONCE(oldest != snapc);
- end_page_writeback(page);
+ folio_end_writeback(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
@@ -820,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
return err;
}
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err;
- struct inode *inode = page->mapping->host;
- BUG_ON(!inode);
- ihold(inode);
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_fs_client(inode)->write_congested) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
-
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
-
- err = writepage_nounlock(page, wbc);
- if (err == -ERESTARTSYS) {
- /* direct memory reclaimer was killed by SIGKILL. return 0
- * to prevent caller from setting mapping/page error */
- err = 0;
- }
- unlock_page(page);
- iput(inode);
- return err;
-}
-
/*
* async writeback completion handler.
*
@@ -865,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req)
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
unsigned int len = 0;
bool remove_page;
@@ -920,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
+
+ if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
+ wake_up_all(&mdsc->flush_end_wq);
+ WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
+ }
+
doutc(cl, "unlocking %p\n", page);
if (remove_page)
@@ -949,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
+static inline
+bool is_forced_umount(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
- struct ceph_vino vino = ceph_vino(inode);
- pgoff_t index, start_index, end = -1;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
- struct folio_batch fbatch;
- int rc = 0;
- unsigned int wsize = i_blocksize(inode);
- struct ceph_osd_request *req = NULL;
- struct ceph_writeback_ctl ceph_wbc;
- bool should_loop, range_whole = false;
- bool done = false;
- bool caching = ceph_is_cache_enabled(inode);
- xa_mark_t tag;
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fsc->write_congested)
- return 0;
-
- doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
@@ -987,388 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping,
ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
- return -EIO; /* we're in a forced umount, don't write! */
+ return true;
}
+
+ return false;
+}
+
+static inline
+unsigned int ceph_define_write_size(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ unsigned int wsize = i_blocksize(inode);
+
if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- folio_batch_init(&fbatch);
+ return wsize;
+}
+
+static inline
+void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_init(&ceph_wbc->fbatch);
+ ceph_wbc->processed_in_fbatch = 0;
+}
+
+static inline
+void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_release(&ceph_wbc->fbatch);
+ ceph_folio_batch_init(ceph_wbc);
+}
+
+static inline
+void ceph_init_writeback_ctl(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ ceph_wbc->snapc = NULL;
+ ceph_wbc->last_snapc = NULL;
+
+ ceph_wbc->strip_unit_end = 0;
+ ceph_wbc->wsize = ceph_define_write_size(mapping);
- start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
- index = start_index;
+ ceph_wbc->nr_folios = 0;
+ ceph_wbc->max_pages = 0;
+ ceph_wbc->locked_pages = 0;
+
+ ceph_wbc->done = false;
+ ceph_wbc->should_loop = false;
+ ceph_wbc->range_whole = false;
+
+ ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- tag = PAGECACHE_TAG_TOWRITE;
+ ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
} else {
- tag = PAGECACHE_TAG_DIRTY;
+ ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
}
-retry:
+
+ ceph_wbc->op_idx = -1;
+ ceph_wbc->num_ops = 0;
+ ceph_wbc->offset = 0;
+ ceph_wbc->len = 0;
+ ceph_wbc->from_pool = false;
+
+ ceph_folio_batch_init(ceph_wbc);
+
+ ceph_wbc->pages = NULL;
+ ceph_wbc->data_pages = NULL;
+}
+
+static inline
+int ceph_define_writeback_range(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+
/* find oldest snap context with dirty data */
- snapc = get_oldest_context(inode, &ceph_wbc, NULL);
- if (!snapc) {
+ ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
+ if (!ceph_wbc->snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
doutc(cl, " no snap context with dirty data?\n");
- goto out;
+ return -ENODATA;
}
- doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
- snapc->seq, snapc->num_snaps);
- should_loop = false;
- if (ceph_wbc.head_snapc && snapc != last_snapc) {
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
+ ceph_wbc->snapc, ceph_wbc->snapc->seq,
+ ceph_wbc->snapc->num_snaps);
+
+ ceph_wbc->should_loop = false;
+
+ if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
/* where to start/end? */
if (wbc->range_cyclic) {
- index = start_index;
- end = -1;
- if (index > 0)
- should_loop = true;
- doutc(cl, " cyclic, start at %lu\n", index);
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
+ doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
+ ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = true;
- doutc(cl, " not cyclic, %lu to %lu\n", index, end);
+ ceph_wbc->range_whole = true;
+ doutc(cl, " not cyclic, %lu to %lu\n",
+ ceph_wbc->index, ceph_wbc->end);
}
- } else if (!ceph_wbc.head_snapc) {
+ } else if (!ceph_wbc->head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
* in that range can be associated with newer snapc.
* They are not writeable until we write all dirty pages
* associated with 'snapc' get written */
- if (index > 0)
- should_loop = true;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
doutc(cl, " non-head snapc, range whole\n");
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
+ ceph_put_snap_context(ceph_wbc->last_snapc);
+ ceph_wbc->last_snapc = ceph_wbc->snapc;
- ceph_put_snap_context(last_snapc);
- last_snapc = snapc;
+ return 0;
+}
- while (!done && index <= end) {
- int num_ops = 0, op_idx;
- unsigned i, nr_folios, max_pages, locked_pages = 0;
- struct page **pages = NULL, **data_pages;
- struct page *page;
- pgoff_t strip_unit_end = 0;
- u64 offset = 0, len = 0;
- bool from_pool = false;
+static inline
+bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
+}
- max_pages = wsize >> PAGE_SHIFT;
+static inline
+bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned index)
+{
+ return index < ceph_wbc->nr_folios &&
+ ceph_wbc->locked_pages < ceph_wbc->max_pages;
+}
-get_more_pages:
- nr_folios = filemap_get_folios_tag(mapping, &index,
- end, tag, &fbatch);
- doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
- if (!nr_folios && !locked_pages)
- break;
- for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- struct folio *folio = fbatch.folios[i];
+static
+int ceph_check_page_before_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_snap_context *pgsnapc;
- page = &folio->page;
- doutc(cl, "? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
- else if (!trylock_page(page))
- break;
+ /* only dirty folios, or our accounting breaks */
+ if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) {
+ doutc(cl, "!dirty or !mapping %p\n", folio);
+ return -ENODATA;
+ }
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- doutc(cl, "!dirty or !mapping %p\n", page);
- unlock_page(page);
- continue;
- }
- /* only if matching snap context */
- pgsnapc = page_snap_context(page);
- if (pgsnapc != snapc) {
- doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- if (!should_loop &&
- !ceph_wbc.head_snapc &&
- wbc->sync_mode != WB_SYNC_NONE)
- should_loop = true;
- unlock_page(page);
- continue;
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(&folio->page);
+ if (pgsnapc != ceph_wbc->snapc) {
+ doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq,
+ ceph_wbc->snapc, ceph_wbc->snapc->seq);
+
+ if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ ceph_wbc->should_loop = true;
+
+ return -ENODATA;
+ }
+
+ if (folio_pos(folio) >= ceph_wbc->i_size) {
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc->i_size);
+
+ if ((ceph_wbc->size_stable ||
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0, folio_size(folio));
+
+ return -ENODATA;
+ }
+
+ if (ceph_wbc->strip_unit_end &&
+ (folio->index > ceph_wbc->strip_unit_end)) {
+ doutc(cl, "end of strip unit %p\n", folio);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static inline
+void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned int max_pages)
+{
+ ceph_wbc->pages = kmalloc_array(max_pages,
+ sizeof(*ceph_wbc->pages),
+ GFP_NOFS);
+ if (!ceph_wbc->pages) {
+ ceph_wbc->from_pool = true;
+ ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
+ BUG_ON(!ceph_wbc->pages);
+ }
+}
+
+static inline
+void ceph_allocate_page_array(struct address_space *mapping,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objnum;
+ u64 objoff;
+ u32 xlen;
+
+ /* prepare async write request */
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ ceph_wbc->offset, ceph_wbc->wsize,
+ &objnum, &objoff, &xlen);
+
+ ceph_wbc->num_ops = 1;
+ ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT);
+
+ BUG_ON(ceph_wbc->pages);
+ ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen);
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages);
+
+ ceph_wbc->len = 0;
+}
+
+static inline
+bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
+ const struct folio *folio)
+{
+ return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
+}
+
+static inline
+bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->num_ops >=
+ (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
+}
+
+static inline
+bool is_write_congestion_happened(struct ceph_fs_client *fsc)
+{
+ return atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
+}
+
+static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct page **pages = ceph_wbc->pages;
+ unsigned int index = ceph_wbc->locked_pages;
+ gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
+
+ if (IS_ENCRYPTED(inode)) {
+ pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
+ PAGE_SIZE,
+ 0,
+ gfp_flags);
+ if (IS_ERR(pages[index])) {
+ if (PTR_ERR(pages[index]) == -EINVAL) {
+ pr_err_client(cl, "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
}
- if (page_offset(page) >= ceph_wbc.i_size) {
- doutc(cl, "folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
- if ((ceph_wbc.size_stable ||
- folio_pos(folio) >= i_size_read(inode)) &&
- folio_clear_dirty_for_io(folio))
- folio_invalidate(folio, 0,
- folio_size(folio));
+
+ /* better not fail on first page! */
+ BUG_ON(ceph_wbc->locked_pages == 0);
+
+ pages[index] = NULL;
+ return PTR_ERR(pages[index]);
+ }
+ } else {
+ pages[index] = &folio->page;
+ }
+
+ ceph_wbc->locked_pages++;
+
+ return 0;
+}
+
+static
+int ceph_process_folio_batch(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct folio *folio = NULL;
+ unsigned i;
+ int rc = 0;
+
+ for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) {
+ folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ doutc(cl, "? %p idx %lu, folio_test_writeback %#x, "
+ "folio_test_dirty %#x, folio_test_locked %#x\n",
+ folio, folio->index, folio_test_writeback(folio),
+ folio_test_dirty(folio),
+ folio_test_locked(folio));
+
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
+ continue;
+ }
+
+ if (ceph_wbc->locked_pages == 0)
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
+ break;
+
+ rc = ceph_check_page_before_write(mapping, wbc,
+ ceph_wbc, folio);
+ if (rc == -ENODATA) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ } else if (rc == -E2BIG) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ break;
+ }
+
+ if (!folio_clear_dirty_for_io(folio)) {
+ doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ }
+
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * calculate max possible write size and
+ * allocate a page array
+ */
+ if (ceph_wbc->locked_pages == 0) {
+ ceph_allocate_page_array(mapping, ceph_wbc, folio);
+ } else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
+ if (is_num_ops_too_big(ceph_wbc)) {
+ folio_redirty_for_writepage(wbc, folio);
folio_unlock(folio);
- continue;
- }
- if (strip_unit_end && (page->index > strip_unit_end)) {
- doutc(cl, "end of strip unit %p\n", page);
- unlock_page(page);
break;
}
- if (folio_test_writeback(folio) ||
- folio_test_private_2(folio) /* [DEPRECATED] */) {
- if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", folio);
- folio_unlock(folio);
- continue;
- }
- doutc(cl, "waiting on writeback %p\n", folio);
- folio_wait_writeback(folio);
- folio_wait_private_2(folio); /* [DEPRECATED] */
- }
- if (!clear_page_dirty_for_io(page)) {
- doutc(cl, "%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
- continue;
- }
+ ceph_wbc->num_ops++;
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_wbc->len = 0;
+ }
- /*
- * We have something to write. If this is
- * the first locked page this time through,
- * calculate max possinle write size and
- * allocate a page array
- */
- if (locked_pages == 0) {
- u64 objnum;
- u64 objoff;
- u32 xlen;
-
- /* prepare async write request */
- offset = (u64)page_offset(page);
- ceph_calc_file_object_mapping(&ci->i_layout,
- offset, wsize,
- &objnum, &objoff,
- &xlen);
- len = xlen;
-
- num_ops = 1;
- strip_unit_end = page->index +
- ((len - 1) >> PAGE_SHIFT);
-
- BUG_ON(pages);
- max_pages = calc_pages_for(0, (u64)len);
- pages = kmalloc_array(max_pages,
- sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
- }
-
- len = 0;
- } else if (page->index !=
- (offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
-
- num_ops++;
- offset = (u64)page_offset(page);
- len = 0;
- }
+ /* note position of first page in fbatch */
+ doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
+ ceph_vinop(inode), folio, folio->index);
- /* note position of first page in fbatch */
- doutc(cl, "%llx.%llx will write page %p idx %lu\n",
- ceph_vinop(inode), page, page->index);
-
- if (atomic_long_inc_return(&fsc->writeback_count) >
- CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb))
- fsc->write_congested = true;
-
- if (IS_ENCRYPTED(inode)) {
- pages[locked_pages] =
- fscrypt_encrypt_pagecache_blocks(page,
- PAGE_SIZE, 0,
- locked_pages ? GFP_NOWAIT : GFP_NOFS);
- if (IS_ERR(pages[locked_pages])) {
- if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err_client(cl,
- "inode->i_blkbits=%hhu\n",
- inode->i_blkbits);
- /* better not fail on first page! */
- BUG_ON(locked_pages == 0);
- pages[locked_pages] = NULL;
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
- ++locked_pages;
- } else {
- pages[locked_pages++] = page;
- }
+ fsc->write_congested = is_write_congestion_happened(fsc);
- fbatch.folios[i] = NULL;
- len += thp_size(page);
+ rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
+ folio);
+ if (rc) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ break;
}
- /* did we get anything? */
- if (!locked_pages)
- goto release_folios;
- if (i) {
- unsigned j, n = 0;
- /* shift unused page to beginning of fbatch */
- for (j = 0; j < nr_folios; j++) {
- if (!fbatch.folios[j])
- continue;
- if (n < j)
- fbatch.folios[n] = fbatch.folios[j];
- n++;
- }
- fbatch.nr = n;
+ ceph_wbc->fbatch.folios[i] = NULL;
+ ceph_wbc->len += folio_size(folio);
+ }
- if (nr_folios && i == nr_folios &&
- locked_pages < max_pages) {
- doutc(cl, "reached end fbatch, trying for more\n");
- folio_batch_release(&fbatch);
- goto get_more_pages;
- }
+ ceph_wbc->processed_in_fbatch = i;
+
+ return rc;
+}
+
+static inline
+void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
+{
+ unsigned j, n = 0;
+
+ /* shift unused page to beginning of fbatch */
+ for (j = 0; j < folio_batch_count(fbatch); j++) {
+ if (!fbatch->folios[j])
+ continue;
+
+ if (n < j) {
+ fbatch->folios[n] = fbatch->folios[j];
}
+ n++;
+ }
+
+ fbatch->nr = n;
+}
+
+static
+int ceph_submit_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct ceph_osd_request *req = NULL;
+ struct page *page = NULL;
+ bool caching = ceph_is_cache_enabled(inode);
+ u64 offset;
+ u64 len;
+ unsigned i;
+
new_request:
- offset = ceph_fscrypt_page_offset(pages[0]);
- len = wsize;
+ offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]);
+ len = ceph_wbc->wsize;
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, ceph_wbc->num_ops,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc, ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size, false);
+ if (IS_ERR(req)) {
req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0, num_ops,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, false);
- if (IS_ERR(req)) {
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0,
- min(num_ops,
- CEPH_OSD_SLAB_OPS),
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, true);
- BUG_ON(IS_ERR(req));
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(ceph_wbc->num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc,
+ ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size,
+ true);
+ BUG_ON(IS_ERR(req));
+ }
+
+ page = ceph_wbc->pages[ceph_wbc->locked_pages - 1];
+ BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) {
+ struct folio *folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ page = &folio->page;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
- thp_size(pages[locked_pages - 1]) - offset);
- if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
- rc = -EIO;
- goto release_folios;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+
+ if (!page)
+ continue;
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- req->r_callback = writepages_finish;
- req->r_inode = inode;
-
- /* Format the osd request message and submit the write */
- len = 0;
- data_pages = pages;
- op_idx = 0;
- for (i = 0; i < locked_pages; i++) {
- struct page *page = ceph_fscrypt_pagecache_page(pages[i]);
-
- u64 cur_offset = page_offset(page);
- /*
- * Discontinuity in page range? Ceph can handle that by just passing
- * multiple extents in the write op.
- */
- if (offset + len != cur_offset) {
- /* If it's full, stop here */
- if (op_idx + 1 == req->r_num_ops)
- break;
-
- /* Kick off an fscache write with what we have so far. */
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- /* Start a new extent */
- osd_req_op_extent_dup_last(req, op_idx,
- cur_offset - offset);
- doutc(cl, "got pages at %llu~%llu\n", offset,
- len);
- osd_req_op_extent_osd_data_pages(req, op_idx,
- data_pages, len, 0,
- from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- len = 0;
- offset = cur_offset;
- data_pages = pages + i;
- op_idx++;
- }
- set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
- len += thp_size(page);
+ ceph_osdc_put_request(req);
+ return -EIO;
+ }
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ ceph_wbc->op_idx = 0;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ u64 cur_offset;
+
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+ cur_offset = page_offset(page);
+
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
+ if (offset + len != cur_offset) {
+ /* If it's full, stop here */
+ if (ceph_wbc->op_idx + 1 == req->r_num_ops)
+ break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
+ osd_req_op_extent_dup_last(req, ceph_wbc->op_idx,
+ cur_offset - offset);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages,
+ len, 0,
+ ceph_wbc->from_pool,
+ false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ len = 0;
+ offset = cur_offset;
+ ceph_wbc->data_pages = ceph_wbc->pages + i;
+ ceph_wbc->op_idx++;
}
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- if (ceph_wbc.size_stable) {
- len = min(len, ceph_wbc.i_size - offset);
- } else if (i == locked_pages) {
- /* writepages_finish() clears writeback pages
- * according to the data length, so make sure
- * data length covers all locked pages */
- u64 min_len = len + 1 - thp_size(page);
- len = get_writepages_data_length(inode, pages[i - 1],
- offset);
- len = max(len, min_len);
+
+ set_page_writeback(page);
+
+ if (caching)
+ ceph_set_page_fscache(page);
+
+ len += thp_size(page);
+ }
+
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ if (ceph_wbc->size_stable) {
+ len = min(len, ceph_wbc->i_size - offset);
+ } else if (i == ceph_wbc->locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - thp_size(page);
+ len = get_writepages_data_length(inode,
+ ceph_wbc->pages[i - 1],
+ offset);
+ len = max(len, min_len);
+ }
+
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
+ }
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages, len,
+ 0, ceph_wbc->from_pool, false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops);
+
+ ceph_wbc->from_pool = false;
+ if (i < ceph_wbc->locked_pages) {
+ BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
+ ceph_wbc->num_ops -= req->r_num_ops;
+ ceph_wbc->locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages);
+ memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ memset(ceph_wbc->data_pages + i, 0,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ } else {
+ BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
+ /* request message now owns the pages array */
+ ceph_wbc->pages = NULL;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ req = NULL;
+
+ wbc->nr_to_write -= i;
+ if (ceph_wbc->pages)
+ goto new_request;
+
+ return 0;
+}
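
ceph_submit_write() packs the locked pages into extent ops: pages contiguous in file offset share one op, a gap in the offsets starts a new op, and once r_num_ops is exhausted the remaining pages roll over into a further request via the new_request loop. A small userspace sketch of just the grouping rule (names and the fixed page size are illustrative, not taken from the kernel):

	#include <stddef.h>
	#include <stdint.h>

	#define SKETCH_PAGE_SIZE 4096ULL

	/* Count how many contiguous extents a sorted list of page offsets
	 * would need -- the same test ceph_submit_write() uses when it
	 * decides whether to start a new extent op. */
	static size_t count_extents(const uint64_t *offsets, size_t nr)
	{
		size_t i, extents = 0;
		uint64_t next = 0;

		for (i = 0; i < nr; i++) {
			if (i == 0 || offsets[i] != next)
				extents++;	/* discontinuity: new extent */
			next = offsets[i] + SKETCH_PAGE_SIZE;
		}
		return extents;
	}
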
+
+static
+void ceph_wait_until_current_writes_complete(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct page *page;
+ unsigned i, nr;
+
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ ceph_wbc->start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc->head_snapc) {
+ ceph_wbc->index = 0;
+
+ while ((ceph_wbc->index <= ceph_wbc->end) &&
+ (nr = filemap_get_folios_tag(mapping,
+ &ceph_wbc->index,
+ (pgoff_t)-1,
+ PAGECACHE_TAG_WRITEBACK,
+ &ceph_wbc->fbatch))) {
+ for (i = 0; i < nr; i++) {
+ page = &ceph_wbc->fbatch.folios[i]->page;
+ if (page_snap_context(page) != ceph_wbc->snapc)
+ continue;
+ wait_on_page_writeback(page);
+ }
+
+ folio_batch_release(&ceph_wbc->fbatch);
+ cond_resched();
}
- if (IS_ENCRYPTED(inode))
- len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_writeback_ctl ceph_wbc;
+ int rc = 0;
- doutc(cl, "got pages at %llu~%llu\n", offset, len);
+ if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
+ return 0;
- if (IS_ENCRYPTED(inode) &&
- ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn_client(cl,
- "bad encrypted write offset=%lld len=%llu\n",
- offset, len);
-
- osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- BUG_ON(op_idx + 1 != req->r_num_ops);
-
- from_pool = false;
- if (i < locked_pages) {
- BUG_ON(num_ops <= req->r_num_ops);
- num_ops -= req->r_num_ops;
- locked_pages -= i;
-
- /* allocate new pages array for next request */
- data_pages = pages;
- pages = kmalloc_array(locked_pages, sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ if (is_forced_umount(mapping)) {
+ /* we're in a forced umount, don't write! */
+ return -EIO;
+ }
+
+ ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto out;
+ }
+
+retry:
+ rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
+ if (rc == -ENODATA) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ rc = 0;
+ goto dec_osd_stopping_blocker;
+ }
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
+
+ while (!has_writeback_done(&ceph_wbc)) {
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
+
+get_more_pages:
+ ceph_folio_batch_reinit(&ceph_wbc);
+
+ ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
+ &ceph_wbc.index,
+ ceph_wbc.end,
+ ceph_wbc.tag,
+ &ceph_wbc.fbatch);
+ doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
+ ceph_wbc.tag, ceph_wbc.nr_folios);
+
+ if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
+ break;
+
+process_folio_batch:
+ rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ /* did we get anything? */
+ if (!ceph_wbc.locked_pages)
+ goto release_folios;
+
+ if (ceph_wbc.processed_in_fbatch) {
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
+
+ if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
+ ceph_wbc.locked_pages < ceph_wbc.max_pages) {
+ doutc(cl, "reached end fbatch, trying for more\n");
+ goto get_more_pages;
}
- memcpy(pages, data_pages + i,
- locked_pages * sizeof(*pages));
- memset(data_pages + i, 0,
- locked_pages * sizeof(*pages));
- } else {
- BUG_ON(num_ops != req->r_num_ops);
- index = pages[i - 1]->index + 1;
- /* request message now owns the pages array */
- pages = NULL;
}
- req->r_mtime = inode_get_mtime(inode);
- ceph_osdc_start_request(&fsc->client->osdc, req);
- req = NULL;
+ rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.strip_unit_end = 0;
- wbc->nr_to_write -= i;
- if (pages)
- goto new_request;
+ if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
+ ceph_wbc.nr_folios =
+ folio_batch_count(&ceph_wbc.fbatch);
+ goto process_folio_batch;
+ }
/*
* We stop writing back only if we are not doing
@@ -1377,61 +1713,44 @@ new_request:
* we tagged for writeback prior to entering this loop.
*/
if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
- done = true;
+ ceph_wbc.done = true;
release_folios:
doutc(cl, "folio_batch release on %d folios (%p)\n",
- (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
- folio_batch_release(&fbatch);
+ (int)ceph_wbc.fbatch.nr,
+ ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
+ folio_batch_release(&ceph_wbc.fbatch);
}
- if (should_loop && !done) {
+ if (ceph_wbc.should_loop && !ceph_wbc.done) {
/* more to do; loop back to beginning of file */
doutc(cl, "looping back to beginning of file\n");
- end = start_index - 1; /* OK even when start_index == 0 */
+ /* OK even when start_index == 0 */
+ ceph_wbc.end = ceph_wbc.start_index - 1;
/* to write dirty pages associated with next snapc,
* we need to wait until current writes complete */
- if (wbc->sync_mode != WB_SYNC_NONE &&
- start_index == 0 && /* all dirty pages were checked */
- !ceph_wbc.head_snapc) {
- struct page *page;
- unsigned i, nr;
- index = 0;
- while ((index <= end) &&
- (nr = filemap_get_folios_tag(mapping, &index,
- (pgoff_t)-1,
- PAGECACHE_TAG_WRITEBACK,
- &fbatch))) {
- for (i = 0; i < nr; i++) {
- page = &fbatch.folios[i]->page;
- if (page_snap_context(page) != snapc)
- continue;
- wait_on_page_writeback(page);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
- }
+ ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
- start_index = 0;
- index = 0;
+ ceph_wbc.start_index = 0;
+ ceph_wbc.index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = ceph_wbc.index;
+
+dec_osd_stopping_blocker:
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
out:
- ceph_osdc_put_request(req);
- ceph_put_snap_context(last_snapc);
+ ceph_put_snap_context(ceph_wbc.last_snapc);
doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
rc);
+
return rc;
}
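
The tail of ceph_writepages_start() keeps the usual range_cyclic contract: when should_loop is set the scan wraps back to the start of the file, and the stopping index is saved in mapping->writeback_index so the next call resumes there. A toy userspace sketch of that wrap-and-resume pattern (the dirty array, names and callback are invented for the example):

	#include <stdbool.h>
	#include <stddef.h>

	/* Visit dirty slots starting at *writeback_index, wrap around once,
	 * and record where the next pass should resume. */
	static void cyclic_writeback_pass(bool *dirty, size_t nr_pages,
					  size_t *writeback_index,
					  void (*write_one)(size_t idx))
	{
		size_t idx = *writeback_index, done = 0;

		while (done < nr_pages) {
			if (dirty[idx]) {
				write_one(idx);
				dirty[idx] = false;
			}
			idx = (idx + 1) % nr_pages;	/* wrap to start of file */
			done++;
		}
		*writeback_index = idx;	/* resume point for the next pass */
	}
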
-
-
/*
* See if a given @snapc is either writeable, or already written.
*/
@@ -1447,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode,
/**
* ceph_find_incompatible - find an incompatible context and return it
- * @page: page being dirtied
+ * @folio: folio being dirtied
*
- * We are only allowed to write into/dirty a page if the page is
+ * We are only allowed to write into/dirty a folio if the folio is
* clean, or already dirty within the same snap context. Returns a
* conflicting context if there is one, NULL if there isn't, or a
* negative error code on other errors.
*
- * Must be called with page lock held.
+ * Must be called with folio lock held.
*/
static struct ceph_snap_context *
-ceph_find_incompatible(struct page *page)
+ceph_find_incompatible(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- doutc(cl, " %llx.%llx page %p is shutdown\n",
- ceph_vinop(inode), page);
+ doutc(cl, " %llx.%llx folio %p is shutdown\n",
+ ceph_vinop(inode), folio);
return ERR_PTR(-ESTALE);
}
for (;;) {
struct ceph_snap_context *snapc, *oldest;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc || snapc == ci->i_head_snapc)
break;
/*
- * this page is already dirty in another (older) snap
+ * this folio is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), folio, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
- /* yay, writeable, do it now (without dropping page lock) */
- doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
- ceph_vinop(inode), page, snapc);
- if (clear_page_dirty_for_io(page)) {
- int r = writepage_nounlock(page, NULL);
+ /* yay, writeable, do it now (without dropping folio lock) */
+ doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), folio, snapc);
+ if (folio_clear_dirty_for_io(folio)) {
+ int r = write_folio_nounlock(folio, NULL);
if (r < 0)
return ERR_PTR(r);
}
@@ -1511,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(*foliop, 0));
+ snapc = ceph_find_incompatible(*foliop);
if (snapc) {
int r;
@@ -1594,7 +1913,6 @@ out:
const struct address_space_operations ceph_aops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
@@ -1602,6 +1920,7 @@ const struct address_space_operations ceph_aops = {
.invalidate_folio = ceph_invalidate_folio,
.release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
};
static void ceph_block_sigs(sigset_t *oldset)
@@ -1718,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
- struct page *page = vmf->page;
- loff_t off = page_offset(page);
+ struct folio *folio = page_folio(vmf->page);
+ loff_t off = folio_pos(folio);
loff_t size = i_size_read(inode);
size_t len;
int want, got, err;
@@ -1736,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (off + thp_size(page) <= size)
- len = thp_size(page);
+ if (off + folio_size(folio) <= size)
+ len = folio_size(folio);
else
- len = offset_in_thp(page, size);
+ len = offset_in_folio(folio, size);
doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
ceph_vinop(inode), off, len, size);
@@ -1756,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
off, len, ceph_cap_string(got));
- /* Update time before taking page lock */
+ /* Update time before taking folio lock */
file_update_time(vma->vm_file);
inode_inc_iversion_raw(inode);
do {
struct ceph_snap_context *snapc;
- lock_page(page);
+ folio_lock(folio);
- if (page_mkwrite_check_truncate(page, inode) < 0) {
- unlock_page(page);
+ if (folio_mkwrite_check_truncate(folio, inode) < 0) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
break;
}
- snapc = ceph_find_incompatible(page);
+ snapc = ceph_find_incompatible(folio);
if (!snapc) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
+ /* success. we'll keep the folio locked. */
+ folio_mark_dirty(folio);
ret = VM_FAULT_LOCKED;
break;
}
- unlock_page(page);
+ folio_unlock(folio);
if (IS_ERR(snapc)) {
ret = VM_FAULT_SIGBUS;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 62e99e65250d..a321aa6d0ed2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
if (ptr_pos >= i_size_read(dir))
return NULL;
- if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) {
+ if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) {
ceph_readdir_cache_release(cache_ctl);
- cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
- if (!cache_ctl->page) {
- doutc(cl, " page %lu not found\n", ptr_pgoff);
+ cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff);
+ if (IS_ERR(cache_ctl->folio)) {
+ cache_ctl->folio = NULL;
+ doutc(cl, " folio %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_rwsem, no need to use page lock */
- unlock_page(cache_ctl->page);
- cache_ctl->dentries = kmap(cache_ctl->page);
+ i_rwsem, no need to use folio lock */
+ folio_unlock(cache_ctl->folio);
+ cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0);
}
cache_ctl->index = idx & idx_mask;
@@ -1092,19 +1093,20 @@ out:
return err;
}
-static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
+ struct dentry *ret;
int err;
int op;
err = ceph_wait_on_conflict_unlink(dentry);
if (err)
- return err;
+ return ERR_PTR(err);
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
@@ -1116,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
- err = -EROFS;
+ ret = ERR_PTR(-EROFS);
goto out;
}
if (op == CEPH_MDS_OP_MKDIR &&
ceph_quota_is_max_files_exceeded(dir)) {
- err = -EDQUOT;
+ ret = ERR_PTR(-EDQUOT);
goto out;
}
if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
!fscrypt_has_encryption_key(dir)) {
- err = -ENOKEY;
+ ret = ERR_PTR(-ENOKEY);
goto out;
}
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
- err = PTR_ERR(req);
+ ret = ERR_CAST(req);
goto out;
}
mode |= S_IFDIR;
req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
if (IS_ERR(req->r_new_inode)) {
- err = PTR_ERR(req->r_new_inode);
+ ret = ERR_CAST(req->r_new_inode);
req->r_new_inode = NULL;
goto out_req;
}
@@ -1165,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
!req->r_reply_info.head->is_target &&
!req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+ ret = ERR_PTR(err);
out_req:
+ if (!IS_ERR(ret) && req->r_dentry != dentry)
+ /* Some other dentry was spliced in */
+ ret = dget(req->r_dentry);
ceph_mdsc_put_request(req);
out:
- if (!err)
+ if (!IS_ERR(ret)) {
+ if (ret)
+ dentry = ret;
ceph_init_inode_acls(d_inode(dentry), &as_ctx);
- else
+ } else {
d_drop(dentry);
+ }
ceph_release_acl_sec_ctx(&as_ctx);
- return err;
+ return ret;
}
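
As the hunk shows, ->mkdir() now returns a struct dentry *: ERR_PTR() on failure, NULL when the dentry passed in was used, or a referenced, different dentry when the filesystem spliced one in. A hypothetical caller-side sketch of consuming that convention (this is not the real vfs_mkdir(); it only illustrates the return-value handling visible in this series):

	/* Hedged sketch: pick the dentry to continue with after ->mkdir(). */
	static struct dentry *mkdir_and_pick_dentry(struct mnt_idmap *idmap,
						    struct inode *dir,
						    struct dentry *dentry,
						    umode_t mode)
	{
		struct dentry *de = dir->i_op->mkdir(idmap, dir, dentry, mode);

		if (IS_ERR(de))
			return de;		/* creation failed */
		return de ? de : dentry;	/* NULL means "use the original dentry" */
	}
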
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7dd6c2275085..6ac2bd555e86 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1845,10 +1845,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
- if (ctl->page) {
- kunmap(ctl->page);
- put_page(ctl->page);
- ctl->page = NULL;
+ if (ctl->folio) {
+ folio_release_kmap(ctl->folio, ctl->dentries);
+ ctl->folio = NULL;
}
}
@@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
- if (!ctl->page || pgoff != ctl->page->index) {
+ if (!ctl->folio || pgoff != ctl->folio->index) {
ceph_readdir_cache_release(ctl);
+ fgf_t fgf = FGP_LOCK;
+
if (idx == 0)
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
- else
- ctl->page = find_lock_page(&dir->i_data, pgoff);
- if (!ctl->page) {
+ fgf |= FGP_ACCESSED | FGP_CREAT;
+
+ ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
+ fgf, mapping_gfp_mask(&dir->i_data));
+ if (IS_ERR(ctl->folio)) {
+ int err = PTR_ERR(ctl->folio);
+
+ ctl->folio = NULL;
ctl->index = -1;
- return idx == 0 ? -ENOMEM : 0;
+ return idx == 0 ? err : 0;
}
/* reading/filling the cache are serialized by
- * i_rwsem, no need to use page lock */
- unlock_page(ctl->page);
- ctl->dentries = kmap(ctl->page);
+ * i_rwsem, no need to use folio lock */
+ folio_unlock(ctl->folio);
+ ctl->dentries = kmap_local_folio(ctl->folio, 0);
if (idx == 0)
memset(ctl->dentries, 0, PAGE_SIZE);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 54b3421501e9..230e0c3f341f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->stopping_lock);
atomic_set(&mdsc->stopping_blockers, 0);
init_completion(&mdsc->stopping_waiter);
+ atomic64_set(&mdsc->dirty_folios, 0);
+ init_waitqueue_head(&mdsc->flush_end_wq);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->quotarealms_inodes = RB_ROOT;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7c9fee9e80d4..3e2a6fa7c19a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -458,6 +458,9 @@ struct ceph_mds_client {
atomic_t stopping_blockers;
struct completion stopping_waiter;
+ atomic64_t dirty_folios;
+ wait_queue_head_t flush_end_wq;
+
atomic64_t quotarealms_count; /* # realms with quota */
/*
* We keep a list of inodes we don't see in the mountpoint but that we
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4344e1f11806..f3951253e393 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s)
*/
sync_filesystem(s);
+ if (atomic64_read(&mdsc->dirty_folios) > 0) {
+ wait_queue_head_t *wq = &mdsc->flush_end_wq;
+ long timeleft = wait_event_killable_timeout(*wq,
+ atomic64_read(&mdsc->dirty_folios) <= 0,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+ }
+
spin_lock(&mdsc->stopping_lock);
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
wait = !!atomic_read(&mdsc->stopping_blockers);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7fa1e7be50e4..bb0db0cc8003 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf)
}
struct ceph_readdir_cache_control {
- struct page *page;
+ struct folio *folio;
struct dentry **dentries;
int index;
};
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a3e2dfeedfbf..ab69d8f0cec2 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -166,8 +166,8 @@ err_out:
return error;
}
-static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *de, umode_t mode)
+static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *de, umode_t mode)
{
struct inode *inode;
struct coda_vattr attrs;
@@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct CodaFid newfid;
if (is_root_inode(dir) && coda_iscontrol(name, len))
- return -EPERM;
+ return ERR_PTR(-EPERM);
attrs.va_mode = mode;
- error = venus_mkdir(dir->i_sb, coda_i2f(dir),
+ error = venus_mkdir(dir->i_sb, coda_i2f(dir),
name, len, &newfid, &attrs);
if (error)
goto err_out;
-
+
inode = coda_iget(dir->i_sb, &newfid, &attrs);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
@@ -195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
coda_dir_inc_nlink(dir);
coda_dir_update_mtime(dir);
d_instantiate(de, inode);
- return 0;
+ return NULL;
err_out:
d_drop(de);
- return error;
+ return ERR_PTR(error);
}
/* try to make de an entry in dir_inode linked to source_de */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7d10278db30d..5568cb74b322 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1280,8 +1280,8 @@ out_root_unlock:
}
EXPORT_SYMBOL(configfs_depend_item_unlocked);
-static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int ret = 0;
int module_got = 0;
@@ -1461,7 +1461,7 @@ out_put:
put_fragment(frag);
out:
- return ret;
+ return ERR_PTR(ret);
}
static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/coredump.c b/fs/coredump.c
index 4375c70144d0..c33c177a701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
{
unsigned long addr;
struct page *dump_page;
+ int locked, ret;
dump_page = dump_page_alloc();
if (!dump_page)
return 0;
+ ret = 0;
+ locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
+ if (!locked) {
+ if (mmap_read_lock_killable(current->mm))
+ goto out;
+ locked = 1;
+ }
+
/*
* To avoid having to allocate page tables for virtual address
* ranges that have never been used yet, and also to make it
@@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
* NULL when encountering an empty page table entry that would
* otherwise have been filled with the zero page.
*/
- page = get_dump_page(addr);
+ page = get_dump_page(addr, &locked);
if (page) {
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop) {
- dump_page_free(dump_page);
- return 0;
- }
+ if (stop)
+ goto out;
} else {
dump_skip(cprm, PAGE_SIZE);
}
+
+ if (dump_interrupted())
+ goto out;
+
+ if (!need_resched())
+ continue;
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
cond_resched();
}
+ ret = 1;
+out:
+ if (locked)
+ mmap_read_unlock(current->mm);
+
dump_page_free(dump_page);
- return 1;
+ return ret;
}
#endif
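
The reworked loop takes mmap_read_lock() only around get_dump_page() and drops it before the potentially slow dump_emit_page() and before rescheduling. A small userspace analogue of the same "hold the lock only across the lookup, never across slow work" pattern, using pthreads (all names here are invented):

	#include <pthread.h>

	static pthread_rwlock_t maps_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Look up each item under the read lock, then release the lock
	 * before the slow emit step, mirroring how dump_user_range() now
	 * drops mmap_read_lock() around dump_emit_page(). */
	static int emit_all(void *(*lookup)(unsigned long), int (*emit)(void *),
			    unsigned long start, unsigned long end,
			    unsigned long step)
	{
		unsigned long addr;

		for (addr = start; addr < end; addr += step) {
			void *item;

			pthread_rwlock_rdlock(&maps_lock);
			item = lookup(addr);
			pthread_rwlock_unlock(&maps_lock);	/* never emit under the lock */

			if (item && emit(item))
				return -1;
		}
		return 0;
	}
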
@@ -1016,7 +1042,9 @@ static const struct ctl_table coredump_sysctls[] = {
.data = &core_pipe_limit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "core_file_note_size_limit",
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 5aff5934baa1..b5dfb0aa405a 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -3,6 +3,7 @@ config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_HASH
+ select CRYPTO_HKDF
select CRYPTO_SKCIPHER
select CRYPTO_LIB_SHA256
select KEYS
@@ -24,20 +25,16 @@ config FS_ENCRYPTION
#
# Also note that this option only pulls in the generic implementations of the
# algorithms, not any per-architecture optimized implementations. It is
-# strongly recommended to enable optimized implementations too. It is safe to
-# disable these generic implementations if corresponding optimized
-# implementations will always be available too; for this reason, these are soft
-# dependencies ('imply' rather than 'select'). Only disable these generic
-# implementations if you're sure they will never be needed, though.
+# strongly recommended to enable optimized implementations too.
config FS_ENCRYPTION_ALGS
tristate
- imply CRYPTO_AES
- imply CRYPTO_CBC
- imply CRYPTO_CTS
- imply CRYPTO_ECB
- imply CRYPTO_HMAC
- imply CRYPTO_SHA512
- imply CRYPTO_XTS
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_CTS
+ select CRYPTO_ECB
+ select CRYPTO_HMAC
+ select CRYPTO_SHA512
+ select CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
bool "Enable fscrypt to use inline crypto"
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 328470d40dec..b74b5937e695 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -153,8 +153,8 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
- * @page: the locked pagecache page containing the data to encrypt
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio
+ * @folio: the locked pagecache folio containing the data to encrypt
* @len: size of the data to encrypt, in bytes
* @offs: offset within @page of the data to encrypt, in bytes
* @gfp_flags: memory allocation flags; see details below
@@ -177,23 +177,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
*
* Return: the new encrypted bounce page on success; an ERR_PTR() on failure
*/
-struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
- unsigned int len,
- unsigned int offs,
- gfp_t gfp_flags)
-
+struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
+ size_t len, size_t offs, gfp_t gfp_flags)
{
- const struct inode *inode = page->mapping->host;
+ const struct inode *inode = folio->mapping->host;
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
(offs >> du_bits);
unsigned int i;
int err;
- if (WARN_ON_ONCE(!PageLocked(page)))
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ if (WARN_ON_ONCE(!folio_test_locked(folio)))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
@@ -205,7 +203,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
for (i = offs; i < offs + len; i += du_size, index++) {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
- page, ciphertext_page,
+ &folio->page, ciphertext_page,
du_size, i, gfp_flags);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
@@ -213,7 +211,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
}
}
SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)page);
+ set_page_private(ciphertext_page, (unsigned long)folio);
return ciphertext_page;
}
EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
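
The folio conversion keeps the data-unit index arithmetic: the folio's page-cache index is scaled up to data units, then the data unit within the folio is added. A standalone sketch of just that computation (the constants in the comment are illustrative; the real values come from PAGE_SHIFT and ci_data_unit_bits):

	#include <stdint.h>

	/* With 4 KiB pages (page_shift == 12) and 1 KiB data units
	 * (du_bits == 10), page-cache index 3 at byte offset 2048 maps to
	 * data unit (3 << 2) + (2048 >> 10) == 14. */
	static uint64_t data_unit_index(uint64_t pgindex, unsigned int offs,
					unsigned int page_shift,
					unsigned int du_bits)
	{
		return (pgindex << (page_shift - du_bits)) + (offs >> du_bits);
	}
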
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index 5a384dad2c72..855a0f4b7318 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -1,9 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
- * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
- * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
- *
* This is used to derive keys from the fscrypt master keys.
*
* Copyright 2019 Google LLC
@@ -11,6 +7,7 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
+#include <crypto/hkdf.h>
#include "fscrypt_private.h"
@@ -44,20 +41,6 @@
* there's no way to persist a random salt per master key from kernel mode.
*/
-/* HKDF-Extract (RFC 5869 section 2.2), unsalted */
-static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
- unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
-{
- static const u8 default_salt[HKDF_HASHLEN];
- int err;
-
- err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
- if (err)
- return err;
-
- return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
-}
-
/*
* Compute HKDF-Extract using the given master key as the input keying material,
* and prepare an HMAC transform object keyed by the resulting pseudorandom key.
@@ -69,6 +52,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
unsigned int master_key_size)
{
struct crypto_shash *hmac_tfm;
+ static const u8 default_salt[HKDF_HASHLEN];
u8 prk[HKDF_HASHLEN];
int err;
@@ -84,7 +68,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
goto err_free_tfm;
}
- err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk);
+ err = hkdf_extract(hmac_tfm, master_key, master_key_size,
+ default_salt, HKDF_HASHLEN, prk);
if (err)
goto err_free_tfm;
@@ -118,61 +103,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
u8 *okm, unsigned int okmlen)
{
SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
- u8 prefix[9];
- unsigned int i;
+ u8 *full_info;
int err;
- const u8 *prev = NULL;
- u8 counter = 1;
- u8 tmp[HKDF_HASHLEN];
-
- if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN))
- return -EINVAL;
+ full_info = kzalloc(infolen + 9, GFP_KERNEL);
+ if (!full_info)
+ return -ENOMEM;
desc->tfm = hkdf->hmac_tfm;
- memcpy(prefix, "fscrypt\0", 8);
- prefix[8] = context;
-
- for (i = 0; i < okmlen; i += HKDF_HASHLEN) {
-
- err = crypto_shash_init(desc);
- if (err)
- goto out;
-
- if (prev) {
- err = crypto_shash_update(desc, prev, HKDF_HASHLEN);
- if (err)
- goto out;
- }
-
- err = crypto_shash_update(desc, prefix, sizeof(prefix));
- if (err)
- goto out;
-
- err = crypto_shash_update(desc, info, infolen);
- if (err)
- goto out;
-
- BUILD_BUG_ON(sizeof(counter) != 1);
- if (okmlen - i < HKDF_HASHLEN) {
- err = crypto_shash_finup(desc, &counter, 1, tmp);
- if (err)
- goto out;
- memcpy(&okm[i], tmp, okmlen - i);
- memzero_explicit(tmp, sizeof(tmp));
- } else {
- err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
- if (err)
- goto out;
- }
- counter++;
- prev = &okm[i];
- }
- err = 0;
-out:
- if (unlikely(err))
- memzero_explicit(okm, okmlen); /* so caller doesn't need to */
- shash_desc_zero(desc);
+ memcpy(full_info, "fscrypt\0", 8);
+ full_info[8] = context;
+ memcpy(full_info + 9, info, infolen);
+
+ err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9,
+ okm, okmlen);
+ kfree_sensitive(full_info);
return err;
}
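
fscrypt_hkdf_expand() now delegates HKDF-Expand to the crypto layer and only builds the application-specific info string: the fixed 8-byte "fscrypt\0" prefix, one context byte, then the caller's info. A small userspace sketch of just that buffer construction (buffer handling only, no crypto; the helper name is invented):

	#include <stdlib.h>
	#include <string.h>

	/* Build "fscrypt\0" || context || info, the layout passed to
	 * hkdf_expand() in the patch above.  Returns a malloc'ed buffer of
	 * *out_len bytes, or NULL on allocation failure. */
	static unsigned char *build_fscrypt_hkdf_info(unsigned char context,
						      const unsigned char *info,
						      size_t infolen,
						      size_t *out_len)
	{
		unsigned char *buf = malloc(infolen + 9);

		if (!buf)
			return NULL;
		memcpy(buf, "fscrypt\0", 8);	/* 8 bytes including the NUL */
		buf[8] = context;
		if (infolen)
			memcpy(buf + 9, info, infolen);
		*out_len = infolen + 9;
		return buf;
	}
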
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 40de69860dcf..7fa53d30aec3 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -130,6 +130,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
+ crypto_cfg.key_type = BLK_CRYPTO_KEY_TYPE_RAW;
devs = fscrypt_get_devices(sb, &num_devs);
if (IS_ERR(devs))
@@ -166,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
if (!blk_key)
return -ENOMEM;
- err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
+ err = blk_crypto_init_key(blk_key, raw_key, ci->ci_mode->keysize,
+ BLK_CRYPTO_KEY_TYPE_RAW, crypto_mode,
fscrypt_get_dun_bytes(ci),
1U << ci->ci_data_unit_bits);
if (err) {
diff --git a/fs/dax.c b/fs/dax.c
index 21b47402b3dc..7fd4cd9a51f2 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-static s64 dax_unshare_iter(struct iomap_iter *iter)
+static int dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
u64 copy_len = iomap_length(iter);
u32 mod;
int id = 0;
- s64 ret = 0;
+ s64 ret;
void *daddr = NULL, *saddr = NULL;
if (!iomap_want_unshare_iter(iter))
- return iomap_length(iter);
+ return iomap_iter_advance_full(iter);
/*
* Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
- ret = iomap_length(iter);
- else
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
ret = -EIO;
out_unlock:
dax_read_unlock(id);
- return dax_mem2blk_err(ret);
+ if (ret < 0)
+ return dax_mem2blk_err(ret);
+ return iomap_iter_advance_full(iter);
}
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_unshare_iter(&iter);
+ iter.status = dax_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
return ret;
}
-static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
u64 length = iomap_length(iter);
- s64 written = 0;
+ int ret;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return length;
+ return iomap_iter_advance(iter, &length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
*/
if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
- pos >> PAGE_SHIFT,
- (pos + length - 1) >> PAGE_SHIFT);
+ iter->pos >> PAGE_SHIFT,
+ (iter->pos + length - 1) >> PAGE_SHIFT);
do {
+ loff_t pos = iter->pos;
unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- long rc;
int id;
+ length = min_t(u64, PAGE_SIZE - offset, length);
+
id = dax_read_lock();
- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
+ ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iter, pos, size);
+ ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
- if (rc < 0)
- return rc;
- pos += size;
- length -= size;
- written += size;
+ if (ret < 0)
+ return ret;
+
+ ret = iomap_iter_advance(iter, &length);
+ if (ret)
+ return ret;
} while (length > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return ret;
}
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_zero_iter(&iter, did_zero);
+ iter.status = dax_zero_iter(&iter, did_zero);
return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
-static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
- struct iov_iter *iter)
+static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (pos >= end)
return 0;
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
- return iov_iter_zero(min(length, end - pos), iter);
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
+ done = iov_iter_zero(min(length, end - pos), iter);
+ return iomap_iter_advance(iomi, &done);
+ }
}
/*
@@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
id = dax_read_lock();
- while (pos < end) {
+ while ((pos = iomi->pos) < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- pos += xfer;
- length -= xfer;
- done += xfer;
-
- if (xfer == 0)
+ length = xfer;
+ ret = iomap_iter_advance(iomi, &length);
+ if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
- return done ? done : ret;
+ return ret;
}
/**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iomi, ops)) > 0)
- iomi.processed = dax_iomap_iter(&iomi, iter);
+ iomi.status = dax_iomap_iter(&iomi, iter);
done = iomi.pos - iocb->ki_pos;
iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
while ((error = iomap_iter(&iter, ops)) > 0) {
if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
- iter.processed = -EIO; /* fs corruption? */
+ iter.status = -EIO; /* fs corruption? */
continue;
}
@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR))
- iter.processed = PAGE_SIZE;
+ if (!(ret & VM_FAULT_ERROR)) {
+ u64 length = PAGE_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK)
- iter.processed = PMD_SIZE;
+ if (ret != VM_FAULT_FALLBACK) {
+ u64 length = PMD_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
-static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+static int dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
- return len;
+ goto advance;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (!*same)
len = 0;
dax_read_unlock(id);
- return len;
+
+advance:
+ dest_len = len;
+ ret = iomap_iter_advance(it_src, &len);
+ if (!ret)
+ ret = iomap_iter_advance(it_dest, &dest_len);
+ return ret;
out_unlock:
dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret, compared = 0;
+ int ret, status;
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
- compared = dax_range_compare_iter(&src_iter, &dst_iter,
+ status = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
- if (compared < 0)
+ if (status < 0)
return ret;
- src_iter.processed = dst_iter.processed = compared;
+ src_iter.status = dst_iter.status = status;
}
return ret;
}
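
The dax.c changes follow one pattern: iteration helpers no longer return a byte count in iter.processed, they advance the iterator explicitly with iomap_iter_advance() and return a status in iter.status. A hedged kernel-style sketch of that shape, grounded on the converted helpers above (handle_chunk is a hypothetical stand-in for the per-range work):

	static int example_iter(struct iomap_iter *iter)
	{
		u64 remaining = iomap_length(iter);
		int ret;

		do {
			u64 chunk = min_t(u64, remaining, PAGE_SIZE);

			ret = handle_chunk(iter, chunk);	/* hypothetical helper */
			if (ret < 0)
				return ret;

			remaining = chunk;
			ret = iomap_iter_advance(iter, &remaining);
			if (ret)
				return ret;
		} while (remaining > 0);

		return 0;
	}
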
diff --git a/fs/dcache.c b/fs/dcache.c
index e3634916ffb9..bd5aa136153a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -73,8 +73,13 @@
* If no ancestor relationship:
* arbitrary, since it's serialized on rename_lock
*/
-int sysctl_vfs_cache_pressure __read_mostly = 100;
-EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+static int sysctl_vfs_cache_pressure __read_mostly = 100;
+
+unsigned long vfs_pressure_ratio(unsigned long val)
+{
+ return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+}
+EXPORT_SYMBOL_GPL(vfs_pressure_ratio);
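
vfs_pressure_ratio() hides the sysctl behind an exported helper built on mult_frac(), which in spirit divides before multiplying so a large input cannot overflow the intermediate product. A hedged userspace restatement of that arithmetic (the kernel macro in linux/math.h is equivalent in effect):

	/* Scale val by num/den without overflowing the intermediate product:
	 * split val into quotient and remainder with respect to den first. */
	static unsigned long mult_frac_ul(unsigned long val, unsigned long num,
					  unsigned long den)
	{
		return (val / den) * num + ((val % den) * num) / den;
	}

	/* vfs_pressure_ratio(val) then corresponds to
	 * mult_frac_ul(val, sysctl_vfs_cache_pressure, 100). */
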
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
@@ -211,8 +216,20 @@ static const struct ctl_table fs_dcache_sysctls[] = {
},
};
+static const struct ctl_table vm_dcache_sysctls[] = {
+ {
+ .procname = "vfs_cache_pressure",
+ .data = &sysctl_vfs_cache_pressure,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
static int __init init_fs_dcache_sysctls(void)
{
+ register_sysctl_init("vm", vm_dcache_sysctls);
register_sysctl_init("fs", fs_dcache_sysctls);
return 0;
}
@@ -2480,7 +2497,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
{
smp_store_release(&dir->i_dir_seq, n + 2);
preempt_enable_nested();
- wake_up_all(d_wait);
+ if (wq_has_sleeper(d_wait))
+ wake_up_all(d_wait);
}
static void d_wait_lookup(struct dentry *dentry)
@@ -2687,52 +2705,6 @@ void d_add(struct dentry *entry, struct inode *inode)
}
EXPORT_SYMBOL(d_add);
-/**
- * d_exact_alias - find and hash an exact unhashed alias
- * @entry: dentry to add
- * @inode: The inode to go with this dentry
- *
- * If an unhashed dentry with the same name/parent and desired
- * inode already exists, hash and return it. Otherwise, return
- * NULL.
- *
- * Parent directory should be locked.
- */
-struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
-{
- struct dentry *alias;
- unsigned int hash = entry->d_name.hash;
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
- /*
- * Don't need alias->d_lock here, because aliases with
- * d_parent == entry->d_parent are not subject to name or
- * parent changes, because the parent inode i_mutex is held.
- */
- if (alias->d_name.hash != hash)
- continue;
- if (alias->d_parent != entry->d_parent)
- continue;
- if (!d_same_name(alias, entry->d_parent, &entry->d_name))
- continue;
- spin_lock(&alias->d_lock);
- if (!d_unhashed(alias)) {
- spin_unlock(&alias->d_lock);
- alias = NULL;
- } else {
- dget_dlock(alias);
- __d_rehash(alias);
- spin_unlock(&alias->d_lock);
- }
- spin_unlock(&inode->i_lock);
- return alias;
- }
- spin_unlock(&inode->i_lock);
- return NULL;
-}
-EXPORT_SYMBOL(d_exact_alias);
-
static void swap_names(struct dentry *dentry, struct dentry *target)
{
if (unlikely(dname_external(target))) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1096ff8562fa..42e4d6eeb29f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -12,6 +12,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -21,7 +23,6 @@
#include <linux/magic.h>
#include <linux/idr.h>
#include <linux/devpts_fs.h>
-#include <linux/parser.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
@@ -87,14 +88,14 @@ enum {
Opt_err
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_ptmxmode, "ptmxmode=%o"},
- {Opt_newinstance, "newinstance"},
- {Opt_max, "max=%d"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec devpts_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_s32 ("max", Opt_max),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("newinstance", Opt_newinstance),
+ fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
};
struct pts_fs_info {
@@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi)
deactivate_super(fsi->sb);
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
-
/*
- * parse_mount_options():
- * Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value.
- *
- * Note: @data may be NULL (in which case all options are set to default).
+ * devpts_parse_param - Parse mount parameters
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- kuid_t uid;
- kgid_t gid;
-
- opts->setuid = 0;
- opts->setgid = 0;
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->mode = DEVPTS_DEFAULT_MODE;
- opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- opts->max = NR_UNIX98_PTY_MAX;
-
- /* Only allow instances mounted from the initial mount
- * namespace to tap the reserve pool of ptys.
- */
- if (op == PARSE_MOUNT)
- opts->reserve =
- (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
-
- while ((p = strsep(&data, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- int option;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return -EINVAL;
- opts->uid = uid;
- opts->setuid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return -EINVAL;
- opts->gid = gid;
- opts->setgid = 1;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
- case Opt_ptmxmode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->ptmxmode = option & S_IALLUGO;
- break;
- case Opt_newinstance:
- break;
- case Opt_max:
- if (match_int(&args[0], &option) ||
- option < 0 || option > NR_UNIX98_PTY_MAX)
- return -EINVAL;
- opts->max = option;
- break;
- default:
- pr_err("called with bogus options\n");
- return -EINVAL;
- }
+ struct pts_fs_info *fsi = fc->s_fs_info;
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, devpts_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->setuid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->setgid = 1;
+ break;
+ case Opt_mode:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_ptmxmode:
+ opts->ptmxmode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_newinstance:
+ break;
+ case Opt_max:
+ if (result.uint_32 > NR_UNIX98_PTY_MAX)
+ return invalf(fc, "max out of range");
+ opts->max = result.uint_32;
+ break;
}
return 0;
}
-static int mknod_ptmx(struct super_block *sb)
+static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
{
int mode;
int rc = -ENOMEM;
@@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
}
}
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+static int devpts_reconfigure(struct fs_context *fc)
{
- int err;
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
- struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb);
+ struct pts_fs_info *new = fc->s_fs_info;
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ /* Apply the revised options. We don't want to change ->reserve.
+ * Ideally, we'd update each option conditionally on it having been
+ * explicitly changed, but the default is to reset everything so that
+ * would break UAPI...
+ */
+ fsi->mount_opts.setuid = new->mount_opts.setuid;
+ fsi->mount_opts.setgid = new->mount_opts.setgid;
+ fsi->mount_opts.uid = new->mount_opts.uid;
+ fsi->mount_opts.gid = new->mount_opts.gid;
+ fsi->mount_opts.mode = new->mount_opts.mode;
+ fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode;
+ fsi->mount_opts.max = new->mount_opts.max;
/*
* parse_mount_options() restores options to default values
@@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
*/
update_ptmx_mode(fsi);
- return err;
+ return 0;
}
static int devpts_show_options(struct seq_file *seq, struct dentry *root)
@@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
- .remount_fs = devpts_remount,
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(struct super_block *sb)
-{
- struct pts_fs_info *fsi;
-
- fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
- if (!fsi)
- return NULL;
-
- ida_init(&fsi->allocated_ptys);
- fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
- fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- fsi->sb = sb;
-
- return fsi;
-}
-
-static int
-devpts_fill_super(struct super_block *s, void *data, int silent)
+static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct pts_fs_info *fsi = DEVPTS_SB(s);
struct inode *inode;
- int error;
s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
@@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_d_op = &simple_dentry_operations;
s->s_time_gran = 1;
+ fsi->sb = s;
- error = -ENOMEM;
- s->s_fs_info = new_pts_fs_info(s);
- if (!s->s_fs_info)
- goto fail;
-
- error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto fail;
-
- error = -ENOMEM;
inode = new_inode(s);
if (!inode)
- goto fail;
+ return -ENOMEM;
inode->i_ino = 1;
simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_root = d_make_root(inode);
if (!s->s_root) {
pr_err("get root dentry failed\n");
- goto fail;
+ return -ENOMEM;
}
- error = mknod_ptmx(s);
- if (error)
- goto fail_dput;
-
- return 0;
-fail_dput:
- dput(s->s_root);
- s->s_root = NULL;
-fail:
- return error;
+ return mknod_ptmx(s, fc);
}
/*
- * devpts_mount()
+ * devpts_get_tree()
*
* Mount a new (private) instance of devpts. PTYs created in this
* instance are independent of the PTYs in other devpts instances.
*/
-static struct dentry *devpts_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int devpts_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, devpts_fill_super);
+}
+
+static void devpts_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations devpts_context_ops = {
+ .free = devpts_free_fc,
+ .parse_param = devpts_parse_param,
+ .get_tree = devpts_get_tree,
+ .reconfigure = devpts_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int devpts_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, devpts_fill_super);
+ struct pts_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ ida_init(&fsi->allocated_ptys);
+ fsi->mount_opts.uid = GLOBAL_ROOT_UID;
+ fsi->mount_opts.gid = GLOBAL_ROOT_GID;
+ fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+ fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->mount_opts.max = NR_UNIX98_PTY_MAX;
+
+ if (fc->purpose == FS_CONTEXT_FOR_MOUNT &&
+ current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+ fsi->mount_opts.reserve = true;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &devpts_context_ops;
+ return 0;
}
static void devpts_kill_sb(struct super_block *sb)
@@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb)
static struct file_system_type devpts_fs_type = {
.name = "devpts",
- .mount = devpts_mount,
+ .init_fs_context = devpts_init_fs_context,
+ .parameters = devpts_param_specs,
.kill_sb = devpts_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index e48c4f9686d3..13a3d0b26194 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -23,7 +23,7 @@ struct dlm_config_node {
extern const struct rhashtable_params dlm_rhash_rsb_params;
-#define DLM_MAX_ADDR_COUNT 3
+#define DLM_MAX_ADDR_COUNT 8
#define DLM_PROTO_TCP 0
#define DLM_PROTO_SCTP 1
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c8ff88f1cdcf..e01d5f29f4d2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -741,6 +741,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
read_lock_bh(&ls->ls_rsbtbl_lock);
if (!rsb_flag(r, RSB_HASHED)) {
read_unlock_bh(&ls->ls_rsbtbl_lock);
+ error = -EBADR;
goto do_new;
}
@@ -784,6 +785,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
}
} else {
write_unlock_bh(&ls->ls_rsbtbl_lock);
+ error = -EBADR;
goto do_new;
}
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8afac6e2dff0..1929327ffbe1 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -576,7 +576,7 @@ static int new_lockspace(const char *name, const char *cluster,
lockspace to start running (via sysfs) in dlm_ls_start(). */
error = do_uevent(ls, 1);
- if (error)
+ if (error < 0)
goto out_recoverd;
/* wait until recovery is successful or failed */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d28141829c05..70abd4da17a6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1826,8 +1826,8 @@ static int dlm_tcp_listen_validate(void)
{
/* We don't support multi-homed hosts */
if (dlm_local_count > 1) {
- log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
- return -EINVAL;
+ log_print("Detect multi-homed hosts but use only the first IP address.");
+ log_print("Try SCTP, if you want to enable multi-link.");
}
return 0;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d45ef541d848..019a8b4eaaf9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -14,7 +14,7 @@
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
-int sysctl_drop_caches;
+static int sysctl_drop_caches;
static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
@@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}
-int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
+static int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret;
@@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
}
return 0;
}
+
+static const struct ctl_table drop_caches_table[] = {
+ {
+ .procname = "drop_caches",
+ .data = &sysctl_drop_caches,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = drop_caches_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_FOUR,
+ },
+};
+
+static int __init init_vm_drop_caches_sysctls(void)
+{
+ register_sysctl_init("vm", drop_caches_table);
+ return 0;
+}
+fs_initcall(init_vm_drop_caches_sysctls);
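For reference (not part of the change), moving the handler and table into fs/drop_caches.c does not alter the userspace interface: the knob stays write-only (mode 0200) and the accepted values remain bounded by extra1/extra2, i.e. 1 through 4. A minimal writer, equivalent to `echo 3 > /proc/sys/vm/drop_caches`:

/* Hedged sketch: trigger a cache drop the same way
 * `echo 3 > /proc/sys/vm/drop_caches` would. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/drop_caches", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "3\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}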
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a9819ddb1ab8..51a5c54eb740 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -503,18 +503,24 @@ out_lock:
return rc;
}
-static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
struct inode *lower_dir;
rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_mkdir(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode);
- if (rc || d_really_is_negative(lower_dentry))
+ if (rc)
+ goto out;
+
+ lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
+ lower_dentry, mode);
+ rc = PTR_ERR(lower_dentry);
+ if (IS_ERR(lower_dentry))
+ goto out;
+ rc = 0;
+ if (d_unhashed(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
@@ -526,7 +532,7 @@ out:
inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
- return rc;
+ return ERR_PTR(rc);
}
static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
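The mkdir hunk above follows the dentry-returning vfs_mkdir() convention this series assumes: the helper now hands back a dentry (possibly a different one than was passed in, or an ERR_PTR) rather than an int, so callers must carry on with whatever it returns. A minimal caller-side sketch; examplefs_make_subdir() is an illustrative name, not an existing function.

/* Hedged sketch of a caller adapting to the dentry-returning vfs_mkdir()
 * assumed by this series; examplefs_make_subdir() is illustrative only. */
#include <linux/fs.h>
#include <linux/err.h>

static int examplefs_make_subdir(struct mnt_idmap *idmap, struct inode *dir,
				 struct dentry *dentry, umode_t mode)
{
	struct dentry *de;

	de = vfs_mkdir(idmap, dir, dentry, mode);
	if (IS_ERR(de))
		return PTR_ERR(de);

	/* vfs_mkdir() may hand back a different dentry than was passed in;
	 * continue with the returned one, never the original argument. */
	if (d_unhashed(de))
		return -ENOENT;	/* illustrative: treat unhashed as failure */

	return 0;
}

This mirrors what the hunk does when it reassigns lower_dentry from the vfs_mkdir() result and then checks IS_ERR() and d_unhashed() on it.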
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0b1c878317ab..e7b7f426fecf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = {
.destroy_inode = ecryptfs_destroy_inode,
.free_inode = ecryptfs_free_inode,
.statfs = ecryptfs_statfs,
- .remount_fs = NULL,
.evict_inode = ecryptfs_evict_inode,
.show_options = ecryptfs_show_options
};
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..331e49cd1b8d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -13,12 +13,12 @@ config EROFS_FS
smartphones with Android OS, LiveCDs and high-density hosts with
numerous containers;
- It also provides fixed-sized output compression support in order to
- improve storage density as well as keep relatively higher compression
- ratios and implements in-place decompression to reuse the file page
- for compressed data temporarily with proper strategies, which is
- quite useful to ensure guaranteed end-to-end runtime decompression
- performance under extremely memory pressure without extra cost.
+ It also provides transparent compression and deduplication support to
+ improve storage density and maintain relatively high compression
+ ratios, and it implements in-place decompression to temporarily reuse
+ page cache for compressed data using proper strategies, which is
+ quite useful for ensuring guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
and the web pages at <https://erofs.docs.kernel.org> for more details.
@@ -97,7 +97,7 @@ config EROFS_FS_ZIP
select LZ4_DECOMPRESS
default y
help
- Enable fixed-sized output compression for EROFS.
+ Enable transparent compression support for EROFS file systems.
If you don't want to enable compression feature, say N.
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 65ff39401020..2704d7a592a5 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,6 +11,7 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
+ unsigned int inpages, outpages;
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
@@ -59,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[];
struct z_erofs_stream_dctx {
struct z_erofs_decompress_req *rq;
- unsigned int inpages, outpages; /* # of {en,de}coded pages */
int no, ni; /* the current {en,de}coded page # */
unsigned int avail_out; /* remaining bytes in the decoded buffer */
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..2409d2ab0c28 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -25,8 +25,7 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
pgoff_t index = offset >> PAGE_SHIFT;
struct folio *folio = NULL;
@@ -43,10 +42,10 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
return folio;
}
buf->page = folio_file_page(folio, index);
- if (!buf->base && type == EROFS_KMAP)
- buf->base = kmap_local_page(buf->page);
- if (type == EROFS_NO_KMAP)
+ if (!need_kmap)
return NULL;
+ if (!buf->base)
+ buf->base = kmap_local_page(buf->page);
return buf->base + (offset & ~PAGE_MASK);
}
@@ -65,64 +64,47 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type)
+ erofs_off_t offset, bool need_kmap)
{
erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, offset, type);
-}
-
-static int erofs_map_blocks_flatmode(struct inode *inode,
- struct erofs_map_blocks *map)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- struct super_block *sb = inode->i_sb;
- bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
-
- map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
- if (map->m_la < erofs_pos(sb, lastblk)) {
- map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
- } else {
- DBG_BUGON(!tailendpacking);
- map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, map->m_la);
- map->m_plen = inode->i_size - map->m_la;
-
- /* inline data should be located in the same meta block */
- if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- map->m_flags |= EROFS_MAP_META;
- }
- return 0;
+ return erofs_bread(buf, offset, need_kmap);
}
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
{
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = inode->i_sb;
+ unsigned int unit, blksz = sb->s_blocksize;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u64 chunknr;
- unsigned int unit;
+ erofs_blk_t startblk, addrmask;
+ bool tailpacking;
erofs_off_t pos;
- void *kaddr;
+ u64 chunknr;
int err = 0;
trace_erofs_map_blocks_enter(inode, map, 0);
map->m_deviceid = 0;
- if (map->m_la >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = map->m_llen;
+ map->m_flags = 0;
+ if (map->m_la >= inode->i_size)
goto out;
- }
if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
- err = erofs_map_blocks_flatmode(inode, map);
+ tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ if (!tailpacking && vi->startblk == EROFS_NULL_ADDR)
+ goto out;
+ pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking);
+
+ map->m_flags = EROFS_MAP_MAPPED;
+ if (map->m_la < pos) {
+ map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
+ map->m_llen = pos - map->m_la;
+ } else {
+ map->m_pa = erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_llen = inode->i_size - map->m_la;
+ map->m_flags |= EROFS_MAP_META;
+ }
goto out;
}
@@ -135,45 +117,44 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ idx = erofs_read_metabuf(&buf, sb, pos, true);
+ if (IS_ERR(idx)) {
+ err = PTR_ERR(idx);
goto out;
}
map->m_la = chunknr << vi->chunkbits;
- map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
- round_up(inode->i_size - map->m_la, sb->s_blocksize));
-
- /* handle block map */
- if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr;
-
- if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
- map->m_flags = 0;
- } else {
- map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr));
+ map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ round_up(inode->i_size - map->m_la, blksz));
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
+ addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ?
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1;
+ startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) |
+ le32_to_cpu(idx->startblk_lo)) & addrmask;
+ if ((startblk ^ EROFS_NULL_ADDR) & addrmask) {
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = erofs_pos(sb, startblk);
+ map->m_flags = EROFS_MAP_MAPPED;
+ }
+ } else {
+ startblk = le32_to_cpu(*(__le32 *)idx);
+ if (startblk != (u32)EROFS_NULL_ADDR) {
+ map->m_pa = erofs_pos(sb, startblk);
map->m_flags = EROFS_MAP_MAPPED;
}
- goto out_unlock;
- }
- /* parse chunk indexes */
- idx = kaddr;
- switch (le32_to_cpu(idx->blkaddr)) {
- case EROFS_NULL_ADDR:
- map->m_flags = 0;
- break;
- default:
- map->m_deviceid = le16_to_cpu(idx->device_id) &
- EROFS_SB(sb)->device_id_mask;
- map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr));
- map->m_flags = EROFS_MAP_MAPPED;
- break;
}
-out_unlock:
erofs_put_metabuf(&buf);
out:
- if (!err)
- map->m_llen = map->m_plen;
+ if (!err) {
+ map->m_plen = map->m_llen;
+ /* inline data should be located in the same meta block */
+ if ((map->m_flags & EROFS_MAP_META) &&
+ erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) {
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ }
trace_erofs_map_blocks_exit(inode, map, 0, err);
return err;
}
@@ -192,7 +173,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
- erofs_off_t startoff, length;
+ erofs_off_t startoff;
int id;
erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
@@ -205,7 +186,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return -ENODEV;
}
if (devs->flatdev) {
- map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
+ map->m_pa += erofs_pos(sb, dif->uniaddr);
up_read(&devs->rwsem);
return 0;
}
@@ -214,13 +195,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- if (!dif->mapped_blkaddr)
+ if (!dif->uniaddr)
continue;
- startoff = erofs_pos(sb, dif->mapped_blkaddr);
- length = erofs_pos(sb, dif->blocks);
+ startoff = erofs_pos(sb, dif->uniaddr);
if (map->m_pa >= startoff &&
- map->m_pa < startoff + length) {
+ map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
map->m_pa -= startoff;
erofs_fill_from_devinfo(map, sb, dif);
break;
@@ -312,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
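The chunk-index path above now assembles a 48-bit start block from the split startblk_hi/startblk_lo fields and treats an all-ones value (within the address mask) as an unmapped hole. A standalone sketch of that decoding; the all-ones EROFS_NULL_ADDR sentinel is an assumption here, and chunk_is_mapped() is just an illustrative name.

/* Hedged sketch of the 48-bit chunk address decoding used above; the
 * all-ones EROFS_NULL_ADDR sentinel is assumed, not taken from the patch. */
#include <stdbool.h>
#include <stdint.h>

#define EROFS_NULL_ADDR		((uint64_t)-1)	/* assumed all-ones sentinel */

/* Rebuild the start block from the split on-disk halves and report whether
 * the chunk is mapped (i.e. not the null-address hole marker). */
static bool chunk_is_mapped(uint16_t startblk_hi, uint32_t startblk_lo,
			    bool is_48bit, uint64_t *startblk)
{
	uint64_t addrmask = is_48bit ? (1ULL << 48) - 1 : (1ULL << 32) - 1;

	*startblk = (((uint64_t)startblk_hi << 32) | startblk_lo) & addrmask;
	return ((*startblk ^ EROFS_NULL_ADDR) & addrmask) != 0;
}

With is_48bit false, the high half is discarded by the mask, so legacy 32-bit chunk indexes decode exactly as before.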
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2b123b070a42..bf62e2836b60 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -9,14 +9,6 @@
#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
-struct z_erofs_lz4_decompress_ctx {
- struct z_erofs_decompress_req *rq;
- /* # of encoded, decoded pages */
- unsigned int inpages, outpages;
- /* decoded block total length (used for in-place decompression) */
- unsigned int oend;
-};
-
static int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
@@ -55,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
* Fill all gaps with bounce pages if it's a sparse page list. Also check if
* all physical pages are consecutive, which can be seen for moderate CR.
*/
-static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -68,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < ctx->outpages; ++i, ++j) {
+ for (i = j = 0; i < rq->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -114,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq,
void *inpage, void *out, unsigned int *inputmargin,
int *maptype, bool may_inplace)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
- unsigned int omargin, total, i;
+ unsigned int oend, omargin, total, i;
struct page **in;
void *src, *tmp;
if (rq->inplace_io) {
- omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ oend = rq->pageofs_out + rq->outputsize;
+ omargin = PAGE_ALIGN(oend) - oend;
if (rq->partial_decoding || !may_inplace ||
omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < ctx->inpages; ++i)
- if (rq->out[ctx->outpages - ctx->inpages + i] !=
+ for (i = 0; i < rq->inpages; ++i)
+ if (rq->out[rq->outpages - rq->inpages + i] !=
rq->in[i])
goto docopy;
kunmap_local(inpage);
*maptype = 3;
- return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT);
+ return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
}
- if (ctx->inpages <= 1) {
+ if (rq->inpages <= 1) {
*maptype = 0;
return inpage;
}
kunmap_local(inpage);
- src = erofs_vm_map_ram(rq->in, ctx->inpages);
+ src = erofs_vm_map_ram(rq->in, rq->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
*maptype = 1;
@@ -152,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = z_erofs_get_gbuf(ctx->inpages);
+ src = z_erofs_get_gbuf(rq->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_local(inpage);
@@ -197,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
return 0;
}
-static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
- u8 *dst)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *out, *headpage, *src;
@@ -224,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
inputmargin = rq->pageofs_in;
- src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin,
+ src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin,
&maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -251,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
if (maptype == 0) {
kunmap_local(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, ctx->inpages);
+ vm_unmap_ram(src, rq->inpages);
} else if (maptype == 2) {
z_erofs_put_gbuf(src);
} else if (maptype != 3) {
@@ -264,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
- ctx.rq = rq;
- ctx.oend = rq->pageofs_out + rq->outputsize;
- ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
- ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
-
/* one optimized fast path only for non bigpcluster cases yet */
- if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
dst = kmap_local_page(*rq->out);
dst_maptype = 0;
- goto dstmap_out;
- }
-
- /* general decoding path which can be used for all cases */
- ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- dst = page_address(*rq->out);
- dst_maptype = 1;
} else {
- dst = erofs_vm_map_ram(rq->out, ctx.outpages);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, rq->outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
+ }
}
-
-dstmap_out:
- ret = z_erofs_lz4_decompress_mem(&ctx, dst);
+ ret = z_erofs_lz4_decompress_mem(rq, dst);
if (!dst_maptype)
kunmap_local(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, ctx.outpages);
+ vm_unmap_ram(dst, rq->outpages);
return ret;
}
static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages;
const unsigned int bs = rq->sb->s_blocksize;
unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
u8 *kin;
@@ -336,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
rq->outputsize -= cur;
}
- for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) {
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) {
insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
rq->outputsize -= insz;
if (!rq->in[ni])
@@ -373,7 +350,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
unsigned int j;
if (!dctx->avail_out) {
- if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ if (++dctx->no >= rq->outpages || !rq->outputsize) {
erofs_err(sb, "insufficient space for decompressed data");
return -EFSCORRUPTED;
}
@@ -401,7 +378,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
}
if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
- if (++dctx->ni >= dctx->inpages) {
+ if (++dctx->ni >= rq->inpages) {
erofs_err(sb, "invalid compressed data");
return -EFSCORRUPTED;
}
@@ -434,7 +411,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
dctx->bounced = true;
}
- for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ for (j = dctx->ni + 1; j < rq->inpages; ++j) {
if (rq->out[dctx->no] != rq->in[j])
continue;
tmppage = erofs_allocpage(pgpl, rq->gfp);
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 5070d2fcc737..c6908a487054 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -101,13 +101,7 @@ static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct z_erofs_deflate *strm;
int zerr, err;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 40666815046f..832cffb83a66 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct xz_buf buf = {};
struct z_erofs_lzma *strm;
enum xz_ret xz_err;
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index 7e177304967e..b4bfe14229f9 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
zstd_in_buffer in_buf = { NULL, 0, 0 };
zstd_out_buffer out_buf = { NULL, 0, 0 };
struct z_erofs_zstd *strm;
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index c3b90abdee37..2fae209d0274 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -58,9 +58,9 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, dbstart, EROFS_KMAP);
+ de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
- erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
+ erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
@@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
ofs = 0;
}
erofs_put_metabuf(&buf);
+ if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
+ if (!dir_emit_dot(f, ctx))
+ return 0;
+ ++ctx->pos;
+ }
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 199395ed1c1f..9581e9bf8192 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -30,25 +30,19 @@
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
+#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
- EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
- EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
- EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
- EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
- EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
- EROFS_FEATURE_INCOMPAT_DEDUPE | \
- EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
+ ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
u8 tag[64]; /* digest(sha256), etc. */
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
- u8 reserved[56];
+ __le32 blocks_lo; /* total blocks count of this device */
+ __le32 uniaddr_lo; /* unified starting block of this device */
+ __le32 blocks_hi; /* total blocks count MSB */
+ __le16 uniaddr_hi; /* unified starting block MSB */
+ u8 reserved[50];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
@@ -59,13 +53,14 @@ struct erofs_super_block {
__le32 feature_compat;
__u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
-
- __le16 root_nid; /* nid of root directory */
+ union {
+ __le16 rootnid_2b; /* nid of root directory */
+ __le16 blocks_hi; /* (48BIT on) blocks count MSB */
+ } rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
-
- __le64 build_time; /* compact inode time derivation */
- __le32 build_time_nsec; /* compact inode time derivation in ns scale */
- __le32 blocks; /* used for statfs */
+ __le64 epoch; /* base seconds used for compact inodes */
+ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
+ __le32 blocks_lo; /* blocks count LSB */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */
@@ -84,7 +79,10 @@ struct erofs_super_block {
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
__u8 xattr_filter_reserved; /* reserved for xattr name filter */
- __u8 reserved2[23];
+ __u8 reserved[3];
+ __le32 build_time; /* seconds added to epoch for mkfs time */
+ __le64 rootnid_8b; /* (48BIT on) nid of root directory */
+ __u8 reserved2[8];
};
/*
@@ -115,19 +113,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_MASK 0x01
#define EROFS_I_DATALAYOUT_MASK 0x07
-#define EROFS_I_VERSION_BIT 0
-#define EROFS_I_DATALAYOUT_BIT 1
-#define EROFS_I_ALL_BIT 4
-
-#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
+#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */
+#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
-/* with chunk indexes or just a 4-byte blkaddr array */
+/* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+#define EROFS_CHUNK_FORMAT_48BIT 0x0040
-#define EROFS_CHUNK_FORMAT_ALL \
- (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
@@ -140,45 +138,40 @@ struct erofs_inode_chunk_info {
};
union erofs_inode_i_u {
- /* total compressed blocks for compressed inodes */
- __le32 compressed_blocks;
-
- /* block address for uncompressed flat inodes */
- __le32 raw_blkaddr;
-
- /* for device files, used to indicate old/new device # */
- __le32 rdev;
-
- /* for chunk-based files, it contains the summary info */
+ __le32 blocks_lo; /* total blocks count (if compressed inodes) */
+ __le32 startblk_lo; /* starting block number (if flat inodes) */
+ __le32 rdev; /* device ID (if special inodes) */
struct erofs_inode_chunk_info c;
};
+union erofs_inode_i_nb {
+ __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
+ __le16 blocks_hi; /* total blocks count MSB */
+ __le16 startblk_hi; /* starting block number MSB */
+};
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_nlink;
+ union erofs_inode_i_nb i_nb;
__le32 i_size;
- __le32 i_reserved;
+ __le32 i_mtime;
union erofs_inode_i_u i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
__le16 i_gid;
- __le32 i_reserved2;
+ __le32 i_reserved;
};
/* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_reserved;
+ union erofs_inode_i_nb i_nb;
__le64 i_size;
union erofs_inode_i_u i_u;
@@ -248,6 +241,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
if (!i_xattr_icount)
return 0;
+ /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
return sizeof(struct erofs_xattr_ibody_header) +
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
}
@@ -266,11 +260,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
-/* 8-byte inode chunk indexes */
+/* 8-byte inode chunk index */
struct erofs_inode_chunk_index {
- __le16 advise; /* always 0, don't care for now */
+ __le16 startblk_hi; /* starting block number MSB */
__le16 device_id; /* back-end storage id (with bits masked) */
- __le32 blkaddr; /* start block address of this inode chunk */
+ __le32 startblk_lo; /* starting block number of this chunk */
};
/* dirent sorts in alphabet order, thus we can do binary search */
@@ -337,21 +331,20 @@ struct z_erofs_zstd_cfgs {
#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
/*
- * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
- * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
- * (4B) + 2B + (4B) if compacted 2B is on.
- * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
- * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
- * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
- * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
- * bit 5 : fragment pcluster (0 - off; 1 - on)
+ * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes:
+ * 4B (disabled) vs 4B+2B+4B (enabled)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */
+#define Z_EROFS_ADVISE_EXTENTS 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+/* Indicate the record size for each extent if extent metadata is used */
+#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1
+#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3
#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
@@ -363,45 +356,24 @@ struct z_erofs_map_header {
/* indicates the encoded size of tailpacking data */
__le16 h_idata_size;
};
+ __le32 h_extents_lo; /* extent count LSB */
};
__le16 h_advise;
- /*
- * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
- * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
- */
- __u8 h_algorithmtype;
- /*
- * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-6 : reserved;
- * bit 7 : move the whole file into packed inode or not.
- */
- __u8 h_clusterbits;
+ union {
+ struct {
+ /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-3 : logical cluster bits - blkszbits
+ * bit 4-6 : reserved
+ * bit 7 : pack the whole file into packed inode
+ */
+ __u8 h_clusterbits;
+ };
+ __le16 h_extents_hi; /* extent count MSB */
+ };
};
-/*
- * On-disk logical cluster type:
- * 0 - literal (uncompressed) lcluster
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * 2 - compressed lcluster (for NONHEAD lclusters)
- *
- * In detail,
- * 0 - literal (uncompressed) lcluster,
- * di_advise = 0
- * di_clusterofs = the literal data offset of the lcluster
- * di_blkaddr = the blkaddr of the literal pcluster
- *
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * di_advise = 1 or 3
- * di_clusterofs = the decompressed data offset of the lcluster
- * di_blkaddr = the blkaddr of the compressed pcluster
- *
- * 2 - compressed lcluster (for NONHEAD lclusters)
- * di_advise = 2
- * di_clusterofs =
- * the decompressed data offset in its own HEAD lcluster
- * di_u.delta[0] = distance to this HEAD lcluster
- * di_u.delta[1] = distance to the next HEAD lcluster
- */
enum {
Z_EROFS_LCLUSTER_TYPE_PLAIN = 0,
Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1,
@@ -415,11 +387,7 @@ enum {
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
-/*
- * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
- * compressed block count of a compressed extent (in logical clusters, aka.
- * block count of a pcluster).
- */
+/* Set on 1st non-head lcluster to store compressed block count (in blocks) */
#define Z_EROFS_LI_D0_CBLKCNT (1 << 11)
struct z_erofs_lcluster_index {
@@ -428,19 +396,36 @@ struct z_erofs_lcluster_index {
__le16 di_clusterofs;
union {
- /* for the HEAD lclusters */
- __le32 blkaddr;
+ __le32 blkaddr; /* for the HEAD lclusters */
/*
- * for the NONHEAD lclusters
* [0] - distance to its HEAD lcluster
* [1] - distance to the next HEAD lcluster
*/
- __le16 delta[2];
+ __le16 delta[2]; /* for the NONHEAD lclusters */
} di_u;
};
-#define Z_EROFS_FULL_INDEX_ALIGN(end) \
- (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8)
+#define Z_EROFS_MAP_HEADER_END(end) \
+ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header))
+#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8)
+
+#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27)
+#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28
+#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1)
+struct z_erofs_extent {
+ __le32 plen; /* encoded length */
+ __le32 pstart_lo; /* physical offset */
+ __le32 pstart_hi; /* physical offset MSB */
+ __le32 lstart_lo; /* logical offset */
+ __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */
+ __u8 reserved[12]; /* for future use */
+};
+
+static inline int z_erofs_extent_recsize(unsigned int advise)
+{
+ return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) &
+ Z_EROFS_ADVISE_EXTRECSZ_MASK);
+}
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
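Several of the new on-disk fields above split wider quantities into _lo/_hi halves (device slots, the superblock blocks count, chunk indexes), and the two EXTRECSZ advise bits select the per-extent record size. A small userspace-style sketch of how these encodings compose; the helper names are illustrative rather than kernel API, and note that the super.c hunk later in this diff still reads only the _lo halves of the device slot.

/* Hedged sketch (userspace-style, illustrative names): combining split
 * _lo/_hi on-disk fields and decoding the extent record size advise bits. */
#include <stdint.h>

#define Z_EROFS_ADVISE_EXTRECSZ_BIT	1
#define Z_EROFS_ADVISE_EXTRECSZ_MASK	0x3

/* e.g. a 48-bit device start address: uniaddr_lo | uniaddr_hi << 32 */
static uint64_t erofs_combine48(uint32_t lo, uint16_t hi)
{
	return ((uint64_t)hi << 32) | lo;
}

/* Per-record size of a z_erofs_extent entry: 4 << (advise bits 1-2),
 * i.e. 4, 8, 16 or 32 bytes, mirroring z_erofs_extent_recsize() above. */
static unsigned int extent_recsize(unsigned int advise)
{
	return 4u << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) &
		      Z_EROFS_ADVISE_EXTRECSZ_MASK);
}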
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index 0ffd1c63beeb..bec4b56b3826 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -112,7 +112,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
void *src;
src = erofs_read_metabuf(&buf, inode->i_sb,
- map->m_pa + ofs, EROFS_KMAP);
+ map->m_pa + ofs, true);
if (IS_ERR(src)) {
err = PTR_ERR(src);
break;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index ce3d8737df85..9c9129bca346 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -276,7 +276,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
size_t size = map.m_llen;
void *src;
- src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa, true);
if (IS_ERR(src))
return PTR_ERR(src);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..a0ae0b4f7b01 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
+ unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ erofs_blk_t addrmask = BIT_ULL(48) - 1;
struct erofs_inode *vi = EROFS_I(inode);
- const erofs_off_t inode_loc = erofs_iloc(inode);
- erofs_blk_t blkaddr, nblks = 0;
- void *kaddr;
+ struct erofs_inode_extended *die, copied;
struct erofs_inode_compact *dic;
- struct erofs_inode_extended *die, *copied = NULL;
- union erofs_inode_i_u iu;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int ifmt, ofs;
+ unsigned int ifmt;
+ void *ptr;
int err = 0;
- blkaddr = erofs_blknr(sb, inode_loc);
- ofs = erofs_blkoff(sb, inode_loc);
-
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(kaddr));
- return PTR_ERR(kaddr);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inode (nid: %llu) page, err %d",
+ vi->nid, err);
+ goto err_out;
}
- dic = kaddr + ofs;
+ dic = ptr + ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
erofs_err(sb, "unsupported i_format %u of nid %llu",
@@ -73,40 +71,34 @@ static int erofs_read_inode(struct inode *inode)
if (ofs + vi->inode_isize <= sb->s_blocksize) {
ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
+ copied.i_u = die->i_u;
+ copied.i_nb = die->i_nb;
} else {
const unsigned int gotten = sb->s_blocksize - ofs;
- copied = kmalloc(vi->inode_isize, GFP_KERNEL);
- if (!copied) {
- err = -ENOMEM;
+ memcpy(&copied, dic, gotten);
+ ptr = erofs_read_metabuf(&buf, sb,
+ erofs_pos(sb, blkaddr + 1), true);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d",
+ vi->nid, err);
goto err_out;
}
- memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
- EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
- vi->nid, PTR_ERR(kaddr));
- kfree(copied);
- return PTR_ERR(kaddr);
- }
ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, ofs);
- die = copied;
+ memcpy((u8 *)&copied + gotten, ptr, ofs);
+ die = &copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
- /* each extended inode has its own timestamp */
- inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ inode_set_mtime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
- kfree(copied);
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
@@ -114,12 +106,20 @@ static int erofs_read_inode(struct inode *inode)
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- iu = dic->i_u;
+ copied.i_u = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
- set_nlink(inode, le16_to_cpu(dic->i_nlink));
- /* use build time for compact inodes */
- inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
+ if (!S_ISDIR(inode->i_mode) &&
+ ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) {
+ set_nlink(inode, 1);
+ copied.i_nb = dic->i_nb;
+ } else {
+ set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
+ copied.i_nb.startblk_hi = 0;
+ addrmask = BIT_ULL(32) - 1;
+ }
+ inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime),
+ sbi->fixed_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
break;
@@ -136,19 +136,26 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
case S_IFDIR:
+ vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1;
+ fallthrough;
+ case S_IFREG:
case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) |
+ ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32);
+ if (vi->datalayout == EROFS_INODE_FLAT_PLAIN &&
+ !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask))
+ vi->startblk = EROFS_NULL_ADDR;
+
if(S_ISLNK(inode->i_mode)) {
- err = erofs_fill_symlink(inode, kaddr, ofs);
+ err = erofs_fill_symlink(inode, ptr, ofs);
if (err)
goto err_out;
}
break;
case S_IFCHR:
case S_IFBLK:
- inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev));
break;
case S_IFIFO:
case S_IFSOCK:
@@ -161,12 +168,15 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout)) {
- nblks = le32_to_cpu(iu.compressed_blocks);
- } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) <<
+ (sb->s_blocksize_bits - 9);
+ else
+ inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
/* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(iu.c.format);
+ vi->chunkformat = le16_to_cpu(copied.i_u.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
@@ -176,22 +186,15 @@ static int erofs_read_inode(struct inode *inode)
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode_set_mtime_to_ts(inode,
- inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
+ inode_set_atime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
(vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
-
- if (!nblks)
- /* measure inode.i_blocks as generic filesystems */
- inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
- else
- inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
err_out:
- DBG_BUGON(err);
erofs_put_metabuf(&buf);
return err;
}
@@ -202,13 +205,10 @@ static int erofs_fill_inode(struct inode *inode)
int err;
trace_erofs_fill_inode(inode);
-
- /* read inode base data from disk */
err = erofs_read_inode(inode);
if (err)
return err;
- /* setup the new inode */
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
@@ -229,15 +229,10 @@ static int erofs_fill_inode(struct inode *inode)
inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
+ default:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
return 0;
- default:
- return -EFSCORRUPTED;
}
mapping_set_large_folios(inode->i_mapping);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 686d835eb533..4ac188d5d894 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -37,8 +37,7 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
typedef u64 erofs_nid_t;
typedef u64 erofs_off_t;
-/* data type for filesystem-wide blocks number */
-typedef u32 erofs_blk_t;
+typedef u64 erofs_blk_t;
struct erofs_device_info {
char *path;
@@ -47,8 +46,8 @@ struct erofs_device_info {
struct dax_device *dax_dev;
u64 dax_part_off;
- u32 blocks;
- u32 mapped_blkaddr;
+ erofs_blk_t blocks;
+ erofs_blk_t uniaddr;
};
enum {
@@ -143,8 +142,8 @@ struct erofs_sb_info {
unsigned char blkszbits; /* filesystem block size in bit shift */
u32 sb_size; /* total superblock size */
- u32 build_time_nsec;
- u64 build_time;
+ u32 fixed_nsec;
+ s64 epoch;
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
@@ -152,8 +151,6 @@ struct erofs_sb_info {
/* used for statfs, f_files - f_favail */
u64 inos;
- u8 uuid[16]; /* 128-bit uuid for volume */
- u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
@@ -199,11 +196,6 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-enum erofs_kmap_type {
- EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap_local_page() to map the buffer */
-};
-
struct erofs_buf {
struct address_space *mapping;
struct file *file;
@@ -212,8 +204,8 @@ struct erofs_buf {
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
-#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
+#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits))
+#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -233,6 +225,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
+EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
@@ -252,6 +245,7 @@ struct erofs_inode {
unsigned char datalayout;
unsigned char inode_isize;
+ bool dot_omitted;
unsigned int xattr_isize;
unsigned int xattr_name_filter;
@@ -259,7 +253,7 @@ struct erofs_inode {
unsigned int *xattr_shared_xattrs;
union {
- erofs_blk_t raw_blkaddr;
+ erofs_blk_t startblk;
struct {
unsigned short chunkformat;
unsigned char chunkbits;
@@ -268,15 +262,13 @@ struct erofs_inode {
struct {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
- unsigned char z_logical_clusterbits;
- unsigned long z_tailextent_headlcn;
+ unsigned char z_lclusterbits;
union {
- struct {
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
- };
- erofs_off_t z_fragmentoff;
+ u64 z_tailextent_headlcn;
+ u64 z_extents;
};
+ erofs_off_t z_fragmentoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -387,11 +379,10 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type);
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type);
+ erofs_off_t offset, bool need_kmap);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -448,6 +439,7 @@ int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
+int z_erofs_init_super(struct super_block *sb);
unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
@@ -457,7 +449,6 @@ void z_erofs_put_gbuf(void *ptr);
int z_erofs_gbuf_growsize(unsigned int nrpages);
int __init z_erofs_gbuf_init(void);
void z_erofs_gbuf_exit(void);
-int erofs_init_managed_cache(struct super_block *sb);
int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
@@ -466,7 +457,7 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_subsystem(void) { return 0; }
static inline void z_erofs_exit_subsystem(void) {}
-static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+static inline int z_erofs_init_super(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index c94d0c1608a8..f7cf4f41af28 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_dirent *de;
buf.mapping = dir->i_mapping;
- de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 827b62665649..cadec6b1b554 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -94,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr))
return ptr;
@@ -110,7 +110,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
@@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_deviceslot *dis;
struct file *file;
- dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ dis = erofs_read_metabuf(buf, sb, *pos, true);
if (IS_ERR(dis))
return PTR_ERR(dis);
@@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
dif->file = file;
}
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ dif->blocks = le32_to_cpu(dis->blocks_lo);
+ dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
sbi->total_blocks += dif->blocks;
*pos += EROFS_DEVT_SLOT_SIZE;
return 0;
@@ -255,7 +255,7 @@ static int erofs_read_superblock(struct super_block *sb)
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ data = erofs_read_metabuf(&buf, sb, 0, true);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
@@ -268,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
- sbi->blkszbits = dsb->blkszbits;
+ sbi->blkszbits = dsb->blkszbits;
if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) {
erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits);
goto out;
@@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -308,23 +308,20 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
- sbi->root_nid = le16_to_cpu(dsb->root_nid);
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+ sbi->dif0.blocks = (sbi->dif0.blocks << 32) |
+ le16_to_cpu(dsb->rb.blocks_hi);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
sbi->inos = le64_to_cpu(dsb->inos);
- sbi->build_time = le64_to_cpu(dsb->build_time);
- sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
-
+ sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
+ sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
- ret = strscpy(sbi->volume_name, dsb->volume_name,
- sizeof(dsb->volume_name));
- if (ret < 0) { /* -E2BIG */
- erofs_err(sb, "bad volume name without NIL terminator");
- ret = -EFSCORRUPTED;
- goto out;
- }
-
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
@@ -333,6 +330,8 @@ static int erofs_read_superblock(struct super_block *sb)
/* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
+ if (erofs_sb_has_48bit(sbi))
+ erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
@@ -639,9 +638,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
else
sb->s_flags &= ~SB_POSIXACL;
-#ifdef CONFIG_EROFS_FS_ZIP
- xa_init(&sbi->managed_pslots);
-#endif
+ err = z_erofs_init_super(sb);
+ if (err)
+ return err;
+
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->packed_inode = inode;
+ }
inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
@@ -653,24 +659,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
iput(inode);
return -EINVAL;
}
-
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
erofs_shrinker_register(sb);
- if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
- sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
- if (IS_ERR(sbi->packed_inode)) {
- err = PTR_ERR(sbi->packed_inode);
- sbi->packed_inode = NULL;
- return err;
- }
- }
- err = erofs_init_managed_cache(sb);
- if (err)
- return err;
-
err = erofs_xattr_prefixes_init(sb);
if (err)
return err;
@@ -806,6 +799,16 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
+static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
+{
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
+
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -815,6 +818,7 @@ static void erofs_kill_sb(struct super_block *sb)
kill_anon_super(sb);
else
kill_block_super(sb);
+ erofs_drop_internal_inodes(sbi);
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
erofs_sb_free(sbi);
@@ -825,17 +829,10 @@ static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- DBG_BUGON(!sbi);
-
erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
erofs_xattr_prefixes_cleanup(sb);
-#ifdef CONFIG_EROFS_FS_ZIP
- iput(sbi->managed_cache);
- sbi->managed_cache = NULL;
-#endif
- iput(sbi->packed_inode);
- sbi->packed_inode = NULL;
+ erofs_drop_internal_inodes(sbi);
erofs_free_dev_context(sbi->devs);
sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 19d586273b70..dad4e6c6c155 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -81,6 +81,7 @@ EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
+EROFS_ATTR_FEATURE(48bit);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -93,6 +94,7 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(ztailpacking),
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
+ ATTR_LIST(48bit),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index df2777e05661..9cf84717a92e 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -81,7 +81,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
@@ -102,7 +102,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
@@ -183,7 +183,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -286,7 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -330,7 +330,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -367,7 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
for (i = 0; i < vi->xattr_shared_count; ++i) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
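The mechanical EROFS_KMAP→true replacements throughout xattr.c (and the EROFS_NO_KMAP→false one in zdata.c below) suggest erofs_bread() and erofs_read_metabuf() traded their kmap-type enum for a plain boolean. A minimal sketch of the assumed updated prototypes, which are not part of these hunks:

/* sketch only: assumed post-change prototypes (normally in fs/erofs/internal.h) */
void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
			 erofs_off_t offset, bool need_kmap);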
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index d771e06db738..0671184d9cf1 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,8 +44,8 @@ struct z_erofs_pcluster {
/* A: point to next chained pcluster or TAILs */
struct z_erofs_pcluster *next;
- /* I: start block address of this pcluster */
- erofs_off_t index;
+ /* I: start physical position of this pcluster */
+ erofs_off_t pos;
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -73,6 +73,9 @@ struct z_erofs_pcluster {
/* I: compression algorithm format */
unsigned char algorithmformat;
+ /* I: whether compressed data is in-lined or not */
+ bool from_meta;
+
/* L: whether partial decompression or not */
bool partial;
@@ -102,14 +105,9 @@ struct z_erofs_decompressqueue {
bool eio, sync;
};
-static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
-{
- return !pcl->index;
-}
-
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
- return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
+ return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT;
}
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
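Since pcl->pos is a byte position rather than a block index, compressed data may now begin mid-page, which is why pageofs_in joins the page-count computation above. A quick worked example with the usual 4 KiB pages (illustrative numbers only):

/* pcl->pos = 0x3200  ->  poff = 0x3200 >> PAGE_SHIFT = 3, pageofs_in = 0x200
 * pcl->pclustersize = 12288 (12 KiB of compressed data)
 * old: PAGE_ALIGN(12288)         >> PAGE_SHIFT = 3 pages
 * new: PAGE_ALIGN(0x200 + 12288) >> PAGE_SHIFT = 4 pages
 * hence the extra _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) slab below.
 */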
@@ -133,7 +131,7 @@ struct z_erofs_pcluster_slab {
static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
- _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1)
};
struct z_erofs_bvec_iter {
@@ -267,7 +265,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclustersize = size;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -516,6 +513,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
struct z_erofs_pcluster *pcl = fe->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
+ pgoff_t poff = pcl->pos >> PAGE_SHIFT;
bool may_bypass = true;
/* Optimistic allocation, as in-place I/O can be used as a fallback */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
@@ -532,7 +530,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- folio = filemap_get_folio(mc, pcl->index + i);
+ folio = filemap_get_folio(mc, poff + i);
if (IS_ERR(folio)) {
may_bypass = false;
if (!shouldalloc)
@@ -575,7 +573,7 @@ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct folio *folio;
int i;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ DBG_BUGON(pcl->from_meta);
/* Each cached folio contains one page unless bs > ps is supported */
for (i = 0; i < pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page) {
@@ -607,7 +605,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
ret = false;
spin_lock(&pcl->lockref.lock);
if (pcl->lockref.count <= 0) {
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ DBG_BUGON(pcl->from_meta);
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
bvec->page = NULL;
@@ -644,18 +642,18 @@ static const struct address_space_operations z_erofs_cache_aops = {
.invalidate_folio = z_erofs_cache_invalidate_folio,
};
-int erofs_init_managed_cache(struct super_block *sb)
+int z_erofs_init_super(struct super_block *sb)
{
struct inode *const inode = new_inode(sb);
if (!inode)
return -ENOMEM;
-
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
+ xa_init(&EROFS_SB(sb)->managed_pslots);
return 0;
}
@@ -667,16 +665,20 @@ static int z_erofs_attach_page(struct z_erofs_frontend *fe,
int ret;
if (exclusive) {
- /* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->lockref.lock);
- while (fe->icur > 0) {
- if (pcl->compressed_bvecs[--fe->icur].page)
- continue;
- pcl->compressed_bvecs[fe->icur] = *bvec;
+ /* Inplace I/O is limited to one page for uncompressed data */
+ if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX ||
+ fe->icur <= 1) {
+ /* Try to prioritize inplace I/O here */
+ spin_lock(&pcl->lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->lockref.lock);
+ return 0;
+ }
spin_unlock(&pcl->lockref.lock);
- return 0;
}
- spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -711,27 +713,26 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl, *pre;
+ unsigned int pageofs_in;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- /* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(map->m_plen);
+ pageofs_in = erofs_blkoff(sb, map->m_pa);
+ pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
+ pcl->pclustersize = map->m_plen;
+ pcl->pageofs_in = pageofs_in;
pcl->length = 0;
pcl->partial = true;
pcl->next = fe->head;
+ pcl->pos = map->m_pa;
+ pcl->pageofs_in = pageofs_in;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ pcl->from_meta = map->m_flags & EROFS_MAP_META;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
@@ -741,13 +742,10 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
mutex_init(&pcl->lock);
DBG_BUGON(!mutex_trylock(&pcl->lock));
- if (ztailpacking) {
- pcl->index = 0; /* which indicates ztailpacking */
- } else {
- pcl->index = erofs_blknr(sb, map->m_pa);
+ if (!pcl->from_meta) {
while (1) {
xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos,
NULL, pcl, GFP_KERNEL);
if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
xa_unlock(&sbi->managed_pslots);
@@ -779,7 +777,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
- erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct z_erofs_pcluster *pcl = NULL;
int ret;
@@ -790,9 +787,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
if (!(map->m_flags & EROFS_MAP_META)) {
while (1) {
rcu_read_lock();
- pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa);
if (!pcl || z_erofs_get_pcluster(pcl)) {
- DBG_BUGON(pcl && blknr != pcl->index);
+ DBG_BUGON(pcl && map->m_pa != pcl->pos);
rcu_read_unlock();
break;
}
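Note that managed_pslots is now keyed by the pcluster's byte position rather than its block number, so the lookup above passes map->m_pa directly. A small worked comparison for a 4 KiB-block image (illustrative value only):

/* m_pa = 0x7f000 (byte offset of the pcluster's compressed data)
 * old xarray key: erofs_blknr(sb, m_pa) = 0x7f000 >> 12 = 0x7f
 * new xarray key: m_pa itself           = 0x7f000
 */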
@@ -826,13 +823,13 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ if (!fe->pcl->from_meta) {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
void *mptr;
- mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
+ mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false);
if (IS_ERR(mptr)) {
ret = PTR_ERR(mptr);
erofs_err(sb, "failed to get inline data %d", ret);
@@ -871,7 +868,7 @@ static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
* It's impossible to fail after the pcluster is frozen, but in order
* to avoid some race conditions, add a DBG_BUGON to observe this.
*/
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl);
lockref_mark_dead(&pcl->lockref);
return true;
@@ -967,7 +964,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, pos, EROFS_KMAP);
+ src = erofs_bread(&buf, pos, true);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
@@ -1221,7 +1218,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl) ||
+ if (pcl->from_meta ||
erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
@@ -1284,6 +1281,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
+ .inpages = pclusterpages,
+ .outpages = be->nr_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
.inputsize = pcl->pclustersize,
@@ -1297,7 +1296,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (pcl->from_meta) {
page = pcl->compressed_bvecs[0].page;
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
@@ -1357,7 +1356,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
- if (z_erofs_is_inline_pcluster(pcl))
+ if (pcl->from_meta)
z_erofs_free_pcluster(pcl);
else
z_erofs_put_pcluster(sbi, pcl, try_free);
@@ -1538,7 +1537,7 @@ out_allocfolio:
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
+ filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@@ -1655,19 +1654,20 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f,
pcl = next;
next = READ_ONCE(pcl->next);
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (pcl->from_meta) {
z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->index),
+ .m_pa = round_down(pcl->pos, sb->s_blocksize),
};
(void)erofs_map_dev(sb, &mdev);
cur = mdev.m_pa;
- end = cur + pcl->pclustersize;
+ end = round_up(cur + pcl->pageofs_in + pcl->pclustersize,
+ sb->s_blocksize);
do {
bvec.bv_page = NULL;
if (bio && (cur != last_pa ||
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 689437e99a5a..8de50df05dfe 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) +
+ const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) +
vi->inode_isize + vi->xattr_isize) +
lcn * sizeof(struct z_erofs_lcluster_index);
struct z_erofs_lcluster_index *di;
unsigned int advise;
- di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true);
if (IS_ERR(di))
return PTR_ERR(di);
m->lcn = lcn;
@@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
advise = le16_to_cpu(di->di_advise);
m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->clusterofs = 1 << vi->z_lclusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
@@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
+ if (m->clusterofs >= 1 << vi->z_lclusterbits) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
- ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize);
+ const unsigned int lclusterbits = vi->z_lclusterbits;
const unsigned int totalidx = erofs_iblks(inode);
unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
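Judging from the removed lines above, Z_EROFS_MAP_HEADER_END() appears to be a named form of the old sizeof(struct z_erofs_map_header) + ALIGN(end, 8) computation; an inferred sketch only, since the actual definition is not part of this hunk:

/* inferred, hedged:
 * #define Z_EROFS_MAP_HEADER_END(end) \
 *	(ALIGN(end, 8) + sizeof(struct z_erofs_map_header))
 */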
@@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
- in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true);
if (IS_ERR(in))
return PTR_ERR(in);
@@ -255,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
{
struct super_block *sb = m->inode->i_sb;
struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lclusterbits = vi->z_lclusterbits;
while (m->lcn >= lookback_distance) {
unsigned long lcn = m->lcn - lookback_distance;
@@ -265,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
if (err)
return err;
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
if (!lookback_distance)
- goto err_bogus;
+ break;
continue;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ } else {
m->headtype = m->type;
m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
return 0;
- default:
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
}
-err_bogus:
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
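The switch-to-range-check conversions here and below assume the head lcluster types are exactly the valid values below Z_EROFS_LCLUSTER_TYPE_MAX other than NONHEAD; a hedged sketch of that assumed ordering (see erofs_fs.h for the authoritative values):

/* assumed on-disk lcluster types (sketch):
 *   Z_EROFS_LCLUSTER_TYPE_PLAIN   = 0
 *   Z_EROFS_LCLUSTER_TYPE_HEAD1   = 1
 *   Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2
 *   Z_EROFS_LCLUSTER_TYPE_HEAD2   = 3
 *   Z_EROFS_LCLUSTER_TYPE_MAX
 * so "type < MAX && type != NONHEAD" selects exactly the head types.
 */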
@@ -308,7 +304,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
- (lcn << vi->z_logical_clusterbits) >= inode->i_size)
+ (lcn << vi->z_lclusterbits) >= inode->i_size)
m->compressedblks = 1;
if (m->compressedblks)
@@ -329,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ if (m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ if (m->compressedblks)
+ goto out;
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
/*
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 block-sized pcluster.
*/
m->compressedblks = 1;
- break;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
- if (m->delta[0] != 1)
- goto err_bonus_cblkcnt;
- if (m->compressedblks)
- break;
- fallthrough;
- default:
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn,
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ goto out;
}
+ erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
out:
m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
-err_bonus_cblkcnt:
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
@@ -365,7 +354,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
struct inode *inode = m->inode;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_map_blocks *map = m->map;
- unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned int lclusterbits = vi->z_lclusterbits;
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
@@ -386,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
m->delta[1] = 1;
DBG_BUGON(1);
}
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
if (lcn != headlcn)
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
@@ -404,23 +391,32 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return 0;
}
-static int z_erofs_do_map_blocks(struct inode *inode,
+static int z_erofs_map_blocks_fo(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
- struct erofs_inode *const vi = EROFS_I(inode);
- bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ bool ztailpacking = vi->z_idata_size;
+ unsigned int lclusterbits = vi->z_lclusterbits;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff, afmt;
+ unsigned int endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
- lclusterbits = vi->z_logical_clusterbits;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ return 0;
+ }
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
@@ -428,9 +424,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (err)
goto unmap_out;
- if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
- vi->z_idataoff = m.nextpackoff;
-
+ if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking)
+ vi->z_fragmentoff = m.nextpackoff;
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
@@ -452,8 +447,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
if (!m.lcn) {
- erofs_err(inode->i_sb,
- "invalid logical cluster 0 at nid %llu",
+ erofs_err(sb, "invalid logical cluster 0 at nid %llu",
vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -469,8 +463,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
goto unmap_out;
break;
default:
- erofs_err(inode->i_sb,
- "unknown type %u @ offset %llu of nid %llu",
+ erofs_err(sb, "unknown type %u @ offset %llu of nid %llu",
m.type, ofs, vi->nid);
err = -EOPNOTSUPP;
goto unmap_out;
@@ -487,12 +480,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
- map->m_pa = vi->z_idataoff;
+ map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map->m_plen);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_FRAGMENT;
} else {
- map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+ map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto unmap_out;
@@ -511,7 +510,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
afmt, vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -535,6 +534,115 @@ unmap_out:
return err;
}
+static int z_erofs_map_blocks_ext(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER;
+ unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
+ erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize), recsz);
+ erofs_off_t lend = inode->i_size;
+ erofs_off_t l, r, mid, pa, la, lstart;
+ struct z_erofs_extent *ext;
+ unsigned int fmt;
+ bool last;
+
+ map->m_flags = 0;
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ pa = le64_to_cpu(*(__le64 *)ext);
+ pos += sizeof(__le64);
+ lstart = 0;
+ } else {
+ lstart = map->m_la >> vi->z_lclusterbits;
+ pa = EROFS_NULL_ADDR;
+ }
+
+ for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ map->m_plen = le32_to_cpu(ext->plen);
+ if (pa != EROFS_NULL_ADDR) {
+ map->m_pa = pa;
+ pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK;
+ } else {
+ map->m_pa = le32_to_cpu(ext->pstart_lo);
+ }
+ pos += recsz;
+ }
+ last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits));
+ lend = min(lstart, lend);
+ lstart -= 1 << vi->z_lclusterbits;
+ } else {
+ lstart = lend;
+ for (l = 0, r = vi->z_extents; l < r; ) {
+ mid = l + (r - l) / 2;
+ ext = erofs_read_metabuf(&map->buf, sb,
+ pos + mid * recsz, true);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+
+ la = le32_to_cpu(ext->lstart_lo);
+ pa = le32_to_cpu(ext->pstart_lo) |
+ (u64)le32_to_cpu(ext->pstart_hi) << 32;
+ if (recsz > offsetof(struct z_erofs_extent, lstart_hi))
+ la |= (u64)le32_to_cpu(ext->lstart_hi) << 32;
+
+ if (la > map->m_la) {
+ r = mid;
+ lend = la;
+ } else {
+ l = mid + 1;
+ if (map->m_la == la)
+ r = min(l + 1, r);
+ lstart = la;
+ map->m_plen = le32_to_cpu(ext->plen);
+ map->m_pa = pa;
+ }
+ }
+ last = (l >= vi->z_extents);
+ }
+
+ if (lstart < lend) {
+ map->m_la = lstart;
+ if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
+ vi->z_fragmentoff = map->m_plen;
+ if (recsz >= offsetof(struct z_erofs_extent, pstart_lo))
+ vi->z_fragmentoff |= map->m_pa << 32;
+ } else if (map->m_plen) {
+ map->m_flags |= EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
+ fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
+ if (fmt)
+ map->m_algorithmformat = fmt - 1;
+ else if (interlaced && !erofs_blkoff(sb, map->m_pa))
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
+ map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK;
+ }
+ }
+ map->m_llen = lend - map->m_la;
+ if (!last && map->m_llen < sb->s_blocksize) {
+ erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu",
+ map->m_llen, map->m_la, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
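The second branch of z_erofs_map_blocks_ext() above is a last-record-not-beyond-target binary search over fixed-size on-disk extent records. A self-contained sketch of the same invariant using hypothetical in-memory records (names and types are illustrative, not the real on-disk layout):

/* sketch: return the last extent whose logical start is <= la */
struct ext_rec { u64 lstart, pstart; u32 plen; };

static int find_extent(const struct ext_rec *recs, unsigned int nr,
		       u64 la, struct ext_rec *out)
{
	unsigned int l = 0, r = nr;

	while (l < r) {
		unsigned int mid = l + (r - l) / 2;

		if (recs[mid].lstart > la) {
			r = mid;	/* everything from mid on starts past la */
		} else {
			*out = recs[mid];	/* best match so far */
			l = mid + 1;
		}
	}
	return l ? 0 : -ENOENT;	/* l == 0: la precedes every extent */
}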
static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
@@ -561,7 +669,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ h = erofs_read_metabuf(&buf, sb, pos, true);
if (IS_ERR(h)) {
err = PTR_ERR(h);
goto out_unlock;
@@ -578,8 +686,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto done;
}
vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15);
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) {
+ vi->z_extents = le32_to_cpu(h->h_extents_lo) |
+ ((u64)le16_to_cpu(h->h_extents_hi) << 32);
+ goto done;
+ }
+
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
headnr = 0;
if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
@@ -590,7 +710,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_put_metabuf;
}
- vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
@@ -608,34 +727,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_put_metabuf;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+ if (vi->z_idata_size ||
+ (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
struct erofs_map_blocks map = {
.buf = __EROFS_BUF_INITIALIZER
};
- vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
-
- if (!map.m_plen ||
- erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map.m_plen);
- err = -EFSCORRUPTED;
- }
- if (err < 0)
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
- !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
- err = z_erofs_do_map_blocks(inode, &map,
+ err = z_erofs_map_blocks_fo(inode, &map,
EROFS_GET_BLOCKS_FINDTAIL);
erofs_put_metabuf(&map.buf);
if (err < 0)
@@ -666,15 +764,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
} else {
err = z_erofs_fill_inode_lazy(inode);
if (!err) {
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
- } else {
- err = z_erofs_do_map_blocks(inode, map, flags);
- }
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
+ err = z_erofs_map_blocks_ext(inode, map, flags);
+ else
+ err = z_erofs_map_blocks_fo(inode, map, flags);
}
if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 76129bfcd663..af42b2c7d235 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags)
if (fd < 0)
goto err;
- file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+ file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
+ ctx, flags, FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(fd);
fd = PTR_ERR(file);
goto err;
}
-
- file->f_mode |= FMODE_NOWAIT;
fd_install(fd, file);
return fd;
err:
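anon_inode_getfile_fmode() is presumably a convenience wrapper that folds the extra f_mode bits into file creation; a hedged sketch of the equivalence this hunk relies on, not the real fs/anon_inodes.c code:

/* assumed equivalence (sketch):
 *   file = anon_inode_getfile_fmode(name, fops, priv, flags, FMODE_NOWAIT);
 * roughly matches
 *   file = anon_inode_getfile(name, fops, priv, flags);
 *   if (!IS_ERR(file))
 *           file->f_mode |= FMODE_NOWAIT;
 */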
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7c0980db77b3..100376863a44 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -447,8 +447,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
if (!budget)
budget = BUSY_POLL_BUDGET;
- if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
+ napi_busy_loop(napi_id, ep_busy_loop_end,
ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
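napi_id_valid() is taken here to be the helper that replaces the open-coded napi_id >= MIN_NAPI_ID comparisons on the removed lines; a hedged sketch of that assumption:

/* assumed helper (sketch; the real one lives on the networking side):
 * static inline bool napi_id_valid(unsigned int napi_id)
 * {
 *	return napi_id >= MIN_NAPI_ID;
 * }
 */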
@@ -492,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* or
* Nothing to do if we already have this ID
*/
- if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
return;
/* record NAPI ID for use in next busy poll */
@@ -546,7 +546,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
napi_suspend_irqs(napi_id);
}
@@ -554,13 +554,13 @@ static void ep_resume_napi_irqs(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
napi_resume_irqs(napi_id);
}
#else
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
{
return false;
}
@@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
+static int ep_try_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
+{
+ int res;
+
+ /*
+ * Try to transfer events to user space. In case we get 0 events and
+ * there's still timeout left over, we go trying again in search of
+ * more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+ return res;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2031,23 +2047,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
while (1) {
if (eavail) {
- /*
- * Try to transfer events to user space. In case we get
- * 0 events and there's still timeout left over, we go
- * trying again in search of more luck.
- */
- res = ep_send_events(ep, events, maxevents);
- if (res) {
- if (res > 0)
- ep_suspend_napi_irqs(ep);
+ res = ep_try_send_events(ep, events, maxevents);
+ if (res)
return res;
- }
}
if (timed_out)
return 0;
- eavail = ep_busy_loop(ep, timed_out);
+ eavail = ep_busy_loop(ep);
if (eavail)
continue;
@@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+ int maxevents)
+{
+ /* The maximum number of events must be greater than zero */
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+ return -EINVAL;
+
+ /* Verify that the area passed by the user is writeable */
+ if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ /*
+ * We have to check that the file structure underneath the fd
+ * the user passed to us _is_ an eventpoll file.
+ */
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents)
+{
+ struct eventpoll *ep;
+ int ret;
+
+ ret = ep_check_params(file, events, maxevents);
+ if (unlikely(ret))
+ return ret;
+
+ ep = file->private_data;
+ /*
+ * Racy call, but that's ok - it should get retried based on
+ * poll readiness anyway.
+ */
+ if (ep_events_available(ep))
+ return ep_try_send_events(ep, events, maxevents);
+ return 0;
+}
+
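epoll_sendevents() reads like a non-blocking in-kernel entry point: parameter checks, one readiness test, one send attempt, never a sleep. A hypothetical caller sketch, hedged, since the real consumer is not part of this hunk:

/* hypothetical caller (sketch): flush whatever is ready without sleeping */
int n = epoll_sendevents(epoll_file, uevents, maxevents);
if (n > 0)
	;	/* n events copied out to uevents */
else if (n == 0)
	;	/* nothing ready right now; retry on the next wakeup */
else
	;	/* -EINVAL, -EFAULT, ... from ep_check_params() */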
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
@@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
struct eventpoll *ep;
-
- /* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
- return -EINVAL;
-
- /* Verify that the area passed by the user is writeable */
- if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
- return -EFAULT;
+ int ret;
/* Get the "struct file *" for the eventpoll file */
CLASS(fd, f)(epfd);
if (fd_empty(f))
return -EBADF;
- /*
- * We have to check that the file structure underneath the fd
- * the user passed to us _is_ an eventpoll file.
- */
- if (!is_file_epoll(fd_file(f)))
- return -EINVAL;
+ ret = ep_check_params(fd_file(f), events, maxevents);
+ if (unlikely(ret))
+ return ret;
/*
* At this point it is safe to assume that the "private_data" contains
diff --git a/fs/exec.c b/fs/exec.c
index 506cd411f4ac..f45859ad13ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -755,8 +755,6 @@ int setup_arg_pages(struct linux_binprm *bprm,
mm->arg_start = bprm->p;
#endif
- if (bprm->loader)
- bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
if (mmap_write_lock_killable(mm))
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 8b30027d8251..fede0283d6e2 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -840,8 +840,8 @@ unlock:
return err;
}
-static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -851,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
loff_t size = i_size_read(dir);
if (unlikely(exfat_forced_shutdown(sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -882,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
unlock:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int exfat_check_dir_empty(struct super_block *sb,
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..b5845c4846b8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("get_parent of %lu failed, err %ld\n",
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 8346ab9534c1..bde617a66cec 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
return err;
}
-static int ext2_mkdir(struct mnt_idmap * idmap,
- struct inode * dir, struct dentry * dentry, umode_t mode)
+static struct dentry *ext2_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir, struct dentry * dentry,
+ umode_t mode)
{
struct inode * inode;
int err;
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode_inc_link_count(dir);
@@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap,
d_instantiate_new(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8042ad873808..c48fd36b2d74 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 2a135075468d..a4dbaccee6e7 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -25,7 +25,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
@@ -48,7 +48,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
@@ -67,7 +67,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -89,7 +89,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 02d47a64e8d1..d4164c507a90 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
bool fake = is_fake_dir_entry(de);
- bool has_csum = ext4_has_metadata_csum(dir->i_sb);
+ bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
@@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
/* Can we just clear INDEX flag to ignore htree information? */
- if (!ext4_has_metadata_csum(sb)) {
+ if (!ext4_has_feature_metadata_csum(sb)) {
/*
* We don't set the inode dirty flag since it's not
* critical that it gets flushed back to the disk.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4e7de7eaa374..5a20e9cd7184 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -278,7 +278,10 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
-#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_FAILED 0x0002
+
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@@ -367,6 +370,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
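EXT4_B_TO_LBLK() rounds a byte offset up to the next block boundary and converts it to a logical block number; a quick worked example with 4 KiB blocks (illustrative numbers only):

/* blocksize = 4096, i_blkbits = 12
 * EXT4_B_TO_LBLK(inode, 5000) = round_up(5000, 4096) >> 12 = 8192 >> 12 = 2
 * EXT4_B_TO_LBLK(inode, 8192) = round_up(8192, 4096) >> 12 = 8192 >> 12 = 2
 * i.e. the first logical block starting at or after the given byte offset.
 */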
@@ -1058,7 +1063,8 @@ struct ext4_inode_info {
/* Number of ongoing updates on this inode */
atomic_t i_fc_updates;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
@@ -1097,8 +1103,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- spinlock_t i_raw_lock; /* protects updates to the raw inode */
-
/*
* File creation time. Its function is same as that of
* struct timespec64 i_{a,c,m}time in the generic inode.
@@ -1141,6 +1145,7 @@ struct ext4_inode_info {
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
+ spinlock_t i_block_reservation_lock;
/* Lock protecting lists below */
spinlock_t i_completed_io_lock;
@@ -1151,8 +1156,6 @@ struct ext4_inode_info {
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- spinlock_t i_block_reservation_lock;
-
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
@@ -1606,6 +1609,8 @@ struct ext4_sb_info {
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
+ unsigned int s_sb_update_sec;
+ unsigned int s_sb_update_kb;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1821,7 +1826,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
- EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -2232,15 +2238,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
/*
* Superblock flags
*/
-#define EXT4_FLAGS_RESIZING 0
-#define EXT4_FLAGS_SHUTDOWN 1
-#define EXT4_FLAGS_BDEV_IS_DAX 2
+enum {
+ EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */
+ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */
+ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */
+ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
+};
static inline int ext4_forced_shutdown(struct super_block *sb)
{
return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
+static inline int ext4_emergency_ro(struct super_block *sb)
+{
+ return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
+}
+
+static inline int ext4_emergency_state(struct super_block *sb)
+{
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+ if (unlikely(ext4_emergency_ro(sb)))
+ return -EROFS;
+ return 0;
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
@@ -2278,6 +2301,13 @@ static inline int ext4_forced_shutdown(struct super_block *sb)
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
+ * Default values for superblock update
+ */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
+
+/*
* Minimum number of groups in a flexgroup before we separate out
* directories into the first block group of a flexgroup
*/
@@ -2810,8 +2840,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent,
struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
-extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
@@ -3001,6 +3030,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -3259,14 +3290,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_metadata_csum(struct super_block *sb)
-{
- return ext4_has_feature_metadata_csum(sb);
-}
-
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+ return ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb);
}
#define ext4_read_incompat_64bit_val(es, name) \
@@ -3546,11 +3573,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata);
+extern int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -3785,34 +3812,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-/* For ioend & aio unwritten conversion wait queues */
-#define EXT4_WQ_HASH_SZ 37
-#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
-extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
+static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- struct inode *inode = io_end->inode;
-
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- }
}
extern const struct iomap_ops ext4_iomap_ops;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index da4a82456383..135e278c832e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -63,12 +63,14 @@ static void ext4_put_nojournal(handle_t *handle)
*/
static int ext4_journal_check_start(struct super_block *sb)
{
+ int ret;
journal_t *journal;
might_sleep();
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
@@ -244,7 +246,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
}
} else
ext4_check_bdev_write_error(sb);
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
@@ -331,7 +334,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
err);
return err;
}
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..3221714d9901 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,90 +122,6 @@
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
-/**
- * struct ext4_journal_cb_entry - Base structure for callback information.
- *
- * This struct is a 'seed' structure for a using with your own callback
- * structs. If you are using callbacks you must allocate one of these
- * or another struct of your own definition which has this struct
- * as it's first element and pass it to ext4_journal_callback_add().
- */
-struct ext4_journal_cb_entry {
- /* list information for other callbacks attached to the same handle */
- struct list_head jce_list;
-
- /* Function to call with this callback structure */
- void (*jce_func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce, int error);
-
- /* user data goes here */
-};
-
-/**
- * ext4_journal_callback_add: add a function to call after transaction commit
- * @handle: active journal transaction handle to register callback on
- * @func: callback function to call after the transaction has committed:
- * @sb: superblock of current filesystem for transaction
- * @jce: returned journal callback data
- * @rc: journal state at commit (0 = transaction committed properly)
- * @jce: journal callback data (internal and function private data struct)
- *
- * The registered function will be called in the context of the journal thread
- * after the transaction for which the handle was created has completed.
- *
- * No locks are held when the callback function is called, so it is safe to
- * call blocking functions from within the callback, but the callback should
- * not block or run for too long, or the filesystem will be blocked waiting for
- * the next transaction to commit. No journaling functions can be used, or
- * there is a risk of deadlock.
- *
- * There is no guaranteed calling order of multiple registered callbacks on
- * the same transaction.
- */
-static inline void _ext4_journal_callback_add(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- /* Add the jce to transaction's private list */
- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
-}
-
-static inline void ext4_journal_callback_add(handle_t *handle,
- void (*func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce,
- int rc),
- struct ext4_journal_cb_entry *jce)
-{
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- /* Add the jce to transaction's private list */
- jce->jce_func = func;
- spin_lock(&sbi->s_md_lock);
- _ext4_journal_callback_add(handle, jce);
- spin_unlock(&sbi->s_md_lock);
-}
-
-
-/**
- * ext4_journal_callback_del: delete a registered callback
- * @handle: active journal transaction handle on which callback was registered
- * @jce: registered journal callback entry to unregister
- * Return true if object was successfully removed
- */
-static inline bool ext4_journal_callback_try_del(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- bool deleted;
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- spin_lock(&sbi->s_md_lock);
- deleted = !list_empty(&jce->jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- return deleted;
-}
-
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 1;
}
+/*
+ * Pass the journal explicitly as it may not be cached in sbi->s_journal in
+ * some cases
+ */
+static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
+{
+ int err = 0;
+
+ /*
+ * At this point only two things can be operating on the journal.
+ * JBD2 thread performing transaction commit and s_sb_upd_work
+ * issuing sb update through the journal. Once we set
+ * EXT4_MF_JOURNAL_DESTROY, new ext4_handle_error() calls will not
+ * queue s_sb_upd_work and ext4_force_commit() makes sure any
+ * ext4_handle_error() calls from the running transaction commit are
+ * finished. Hence no new s_sb_upd_work can be queued after we
+ * flush it here.
+ */
+ ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
+
+ ext4_force_commit(sbi->s_sb);
+ flush_work(&sbi->s_sb_upd_work);
+
+ err = jbd2_journal_destroy(journal);
+ sbi->s_journal = NULL;
+
+ return err;
+}
+
#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a07a98a4b97a..c616a16a9f36 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -63,7 +63,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -77,7 +77,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
@@ -4568,131 +4568,65 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- /*
- * Round up offset. This is not fallocate, we need to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range. Here, start and partial_begin are inclusive, end and
- * partial_end are exclusive.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
/* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ return ret;
}
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
-
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- /*
- * For journalled data we need to write (and checkpoint) pages
- * before discarding page cache to avoid inconsitent data on
- * disk in case of crash before zeroing trans is committed.
- */
- if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start,
- end - 1);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
- }
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ return ret;
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- filemap_invalidate_unlock(mapping);
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
+
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ return ret;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if the range doesn't contain a partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4705,27 +4639,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ return ret;
}
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
- inode_unlock(inode);
+ return ret;
+}
+
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ /* We only support preallocation for extent-based files. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4739,12 +4715,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ struct address_space *mapping = file->f_mapping;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4764,73 +4736,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
- inode_unlock(inode);
if (ret)
- goto exit;
-
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(file, offset, len);
- goto exit;
- }
+ goto out_inode_lock;
- if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(file, offset, len);
- goto exit;
- }
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
- if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(file, offset, len);
- goto exit;
- }
+ ret = file_modified(file);
+ if (ret)
+ goto out_inode_lock;
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = ext4_zero_range(file, offset, len, mode);
- goto exit;
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
}
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
-
- max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- inode_lock(inode);
/*
- * We only support preallocation for extent-based files only
+ * Follow-up operations will drop the page cache; hold the invalidate
+ * lock to prevent page faults from reinstantiating pages we have
+ * released from the page cache.
*/
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
- }
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
+ filemap_invalidate_lock(mapping);
- ret = file_modified(file);
+ ret = ext4_break_layouts(inode);
if (ret)
- goto out;
+ goto out_invalidate_lock;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = ext4_punch_hole(file, offset, len);
+ else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ ret = ext4_collapse_range(file, offset, len);
+ else if (mode & FALLOC_FL_INSERT_RANGE)
+ ret = ext4_insert_range(file, offset, len);
+ else if (mode & FALLOC_FL_ZERO_RANGE)
+ ret = ext4_zero_range(file, offset, len, mode);
+ else
+ ret = -EOPNOTSUPP;
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
- }
-out:
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out_inode_lock:
inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-exit:
return ret;
}
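
For reference, the fallocate(2) modes dispatched by the reworked ext4_fallocate() above can be exercised from userspace as in the following illustrative example (not part of the patch; the file name and sizes are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Plain preallocation: handled by ext4_do_fallocate(). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
		perror("preallocate");

	/* Zero a 64 KiB range in place: handled by ext4_zero_range(). */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 64 << 10))
		perror("zero range");

	/* Punch a 4 KiB hole: handled by ext4_punch_hole(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 8192, 4096))
		perror("punch hole");

	close(fd);
	return 0;
}
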
@@ -5332,109 +5278,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t punch_start, punch_stop;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
+ trace_ext4_collapse_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_collapse_range(inode, offset, len);
-
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_mmap;
+ if (end >= inode->i_size)
+ return -EINVAL;
/*
+ * Write tail of the last page before removed range and data that
+ * will be shifted since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_rwsem and invalidate_lock.
- */
- ret = filemap_write_and_wait_range(mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ return ret;
+
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5442,18 +5351,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5473,100 +5380,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
+ int ret, depth, split_flag = 0;
+ loff_t start;
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
+ trace_ext4_insert_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_insert_range(inode, offset, len);
-
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
- }
-
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
+ if (offset >= inode->i_size)
+ return -EINVAL;
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
+ return ret;
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5576,16 +5446,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
- offset_lblk, split_flag,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -5594,32 +5464,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
}
ext4_free_ext_path(path);
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ae29832aab1e..d1401d4a5513 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1551,7 +1551,6 @@ retry:
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3bd96c3d4cd0..beb078ee4811 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -688,10 +688,12 @@ out:
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ int ret;
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
@@ -700,7 +702,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
- int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
@@ -800,11 +801,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ int ret;
struct inode *inode = file->f_mapping->host;
struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (file->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
/*
* We don't support synchronous mappings for non-DAX files and
@@ -835,7 +841,8 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
- if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !sb_start_intwrite_trylock(sb))
return 0;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
@@ -878,8 +885,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (filp->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index b40d3b29f7e5..e476c6de3074 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
- if (sb_rdonly(inode->i_sb)) {
- /* Make sure that we read updated s_ext4_flags value */
- smp_rmb();
- if (ext4_forced_shutdown(inode->i_sb))
- ret = -EROFS;
+ if (sb_rdonly(inode->i_sb))
goto out;
- }
if (!EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fsync_nojournal(file, start, end, datasync,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index deabe29da7fb..33cd5b6b02d5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (len && IS_CASEFOLDED(dir) &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
- buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+ buff = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21d228073d79..38bc8d74f4cc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -951,8 +951,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return ERR_PTR(-EIO);
+ ret2 = ext4_emergency_state(sb);
+ if (unlikely(ret2))
+ return ERR_PTR(ret2);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -1282,7 +1283,7 @@ got:
inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
@@ -1298,7 +1299,7 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
- (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
+ (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3536ca7e4fcc..f608f6554b95 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -20,6 +20,11 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ void **fsdata);
+
static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
@@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@@ -653,91 +658,109 @@ out_nofolio:
}
/*
- * Try to write data in the inode.
- * If the inode has inline data, check whether the new write can be
- * in the inode also. If not, create the page the handle, move the data
- * to the page make it update and let the later codes create extent for it.
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
*/
-int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop)
+int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
-
- if (pos + len > ext4_get_max_inline_size(inode))
- goto convert;
+ int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
- /*
- * The possible write could happen in the inode,
- * so try to reserve the space in inode first.
- */
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
- goto out;
+ goto out_release_bh;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_stop_journal;
- /* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
- brelse(iloc.bh);
- goto convert;
- }
+ if (!da) {
+ brelse(iloc.bh);
+ /* Retry inside */
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out;
+ ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out_release_bh;
+ }
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
- goto out;
+ goto out_stop_journal;
}
- *foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
+ /* Someone else has converted it to an extent */
if (!ext4_has_inline_data(inode)) {
ret = 0;
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
+ goto out_release_folio;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0) {
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
- }
+ if (ret < 0)
+ goto out_release_folio;
}
- ret = 1;
- handle = NULL;
-out_up_read:
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
+ if (ret)
+ goto out_release_folio;
+ *foliop = folio;
up_read(&EXT4_I(inode)->xattr_sem);
-out:
- if (handle && (ret != 1))
- ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return 1;
+
+out_release_folio:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ folio_unlock(folio);
+ folio_put(folio);
+out_stop_journal:
+ ext4_journal_stop(handle);
+out_release_bh:
brelse(iloc.bh);
return ret;
-convert:
- return ext4_convert_inline_data_to_extent(mapping, inode);
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop)
+{
+ if (pos + len > ext4_get_max_inline_size(inode))
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ return ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, NULL, false);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@@ -881,94 +904,6 @@ out:
return ret;
}
-/*
- * Prepare the write for the inline data.
- * If the data can be written into the inode, we just read
- * the page and make it uptodate, and start the journal.
- * Otherwise read the page, makes it dirty so that it can be
- * handle in writepages(the i_disksize update is left to the
- * normal ext4_da_write_end).
- */
-int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata)
-{
- int ret;
- handle_t *handle;
- struct folio *folio;
- struct ext4_iloc iloc;
- int retries = 0;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret)
- return ret;
-
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
-
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
-
- if (ret == -ENOSPC) {
- ext4_journal_stop(handle);
- ret = ext4_da_convert_inline_data_to_extent(mapping,
- inode,
- fsdata);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
- goto out;
- }
-
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio)) {
- ret = PTR_ERR(folio);
- goto out_journal;
- }
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- ret = 0;
- goto out_release_page;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0)
- goto out_release_page;
- }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out_release_page;
-
- up_read(&EXT4_I(inode)->xattr_sem);
- *foliop = folio;
- brelse(iloc.bh);
- return 1;
-out_release_page:
- up_read(&EXT4_I(inode)->xattr_sem);
- folio_unlock(folio);
- folio_put(folio);
-out_journal:
- ext4_journal_stop(handle);
-out:
- brelse(iloc.bh);
- return ret;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1012,7 +947,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ err = ext4_find_dest_de(dir, iloc->bh, inline_start,
inline_size, fname, &de);
if (err)
return err;
@@ -1146,7 +1081,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
inode->i_size = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4..bcb96caf77c0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -93,7 +94,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -114,7 +115,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -751,7 +752,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@@ -1149,8 +1150,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_write_begin(inode, pos, len);
/*
@@ -2225,7 +2227,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@@ -2273,7 +2275,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2599,10 +2601,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
- ret = -EROFS;
+ ret = ext4_emergency_state(mapping->host->i_sb);
+ if (unlikely(ret))
goto out_writepages;
- }
/*
* If we have inline data and arrive here, it means that
@@ -2817,8 +2818,9 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
@@ -2858,8 +2860,9 @@ static int ext4_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
@@ -2915,8 +2918,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
index = pos >> PAGE_SHIFT;
@@ -2929,8 +2933,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- foliop, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
@@ -3290,6 +3294,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
@@ -3902,6 +3910,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsistent data on disk
+ * in case of a crash before the freeing or unwritten-conversion
+ * transaction is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
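
As an illustrative aside (not part of the patch), the early-return test in ext4_truncate_folio() above can be checked with concrete numbers. The sketch below mirrors that test; assuming a 1 KiB block size and 4 KiB pages, punching bytes [1024, 3072) frees whole blocks inside a single folio and therefore requires folio_mkclean():

#include <stdbool.h>

/* Mirrors the "no complete block" early return in ext4_truncate_folio(). */
#define EX_ROUND_UP(x, a)	((((x) + (a) - 1) / (a)) * (a))
#define EX_ROUND_DOWN(x, a)	(((x) / (a)) * (a))

static bool needs_partial_folio_mkclean(unsigned long long start,
					unsigned long long end,
					unsigned long blocksize)
{
	/* A complete block lies in [start, end) only if the rounded-up
	 * start is still below the rounded-down end. */
	return EX_ROUND_UP(start, blocksize) < EX_ROUND_DOWN(end, blocksize);
}

/* Example: needs_partial_folio_mkclean(1024, 3072, 1024) == true, since
 * round_up(1024) = 1024 < round_down(3072) = 3072. */
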
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@@ -3946,91 +4016,50 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
-
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- goto out_mutex;
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size, and also make sure that the hole
+ * stays within one block before the last range.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
- ret = ext4_break_layouts(inode);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -4040,54 +4069,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
- ext4_lblk_t hole_len = stop_block - first_block;
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block, hole_len);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
- ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -4674,6 +4700,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
int err;
+ err = xattr_check_inode(inode, IHDR(inode, raw_inode),
+ ITAIL(inode, raw_inode));
+ if (err)
+ return err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
err = ext4_find_inline_data_nolock(inode);
if (!err && ext4_has_inline_data(inode))
@@ -4800,7 +4831,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
@@ -4887,7 +4918,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
- if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ if (!ext4_has_feature_dir_index(sb) &&
+ ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@@ -5007,8 +5039,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else {
@@ -5228,8 +5268,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
@@ -5351,8 +5392,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ error = ext4_emergency_state(inode->i_sb);
+ if (unlikely(error))
+ return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -5464,7 +5506,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
oldsize & (inode->i_sb->s_blocksize - 1)) {
error = ext4_inode_attach_jinode(inode);
if (error)
- goto err_out;
+ goto out_mmap_sem;
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
@@ -5796,9 +5838,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err)) {
put_bh(iloc->bh);
- return -EIO;
+ return err;
}
ext4_fc_track_inode(handle, inode);
@@ -5822,8 +5865,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7b9ce71c1c81..d17207386ead 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -142,7 +142,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
- if (ext4_has_metadata_csum(sb) &&
+ if (ext4_has_feature_metadata_csum(sb) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
@@ -150,7 +150,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
goto out_bh;
}
func(es, arg);
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(sb, es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -351,7 +351,7 @@ void ext4_reset_inode_seed(struct inode *inode)
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
@@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp,
* If any checksums (group descriptors or metadata) are being used
* then the checksum seed feature is required to change the UUID.
*/
- if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ if (((ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb))
&& !ext4_has_feature_csum_seed(sb))
|| ext4_has_feature_stable_inodes(sb))
return -EOPNOTSUPP;
@@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -1705,7 +1706,7 @@ int ext4_update_overhead(struct super_block *sb, bool force)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sb_rdonly(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb))
return 0;
if (!force &&
(sbi->s_overhead == 0 ||
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index bb2a223b207c..d634c12f1984 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
@@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b25a27c86696..0d523e9fb3d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -187,7 +187,7 @@
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
- * /sys/fs/ext4/<partition>/mb_linear_limit
+ * /sys/fs/ext4/<partition>/mb_max_linear_groups
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -209,7 +209,7 @@
* get traversed linearly. That may result in subsequent allocations being not
* close to each other. And so, the underlying device may get filled up in a
* non-linear fashion. While that may not matter on non-rotational devices, for
- * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * rotational devices that may result in higher seek times. "mb_max_linear_groups"
* tells mballoc how many groups mballoc should search linearly before
* performing consulting above data structures for more efficient lookups. For
* non rotational devices, this value defaults to 0 and for rotational devices
@@ -5653,7 +5653,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
ngroups = ext4_get_groups_count(sb);
@@ -5687,7 +5687,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
mb_debug(sb, "Can't allocate:"
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d64c04ed061a..3e26464b1425 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -21,7 +21,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
@@ -162,7 +162,7 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) {
+ while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
if (!ext4_has_feature_mmp(sb)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 536d56d15072..cb5cb33b1d91 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!ext4_has_metadata_csum(inode->i_sb) ||
+ if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -291,36 +291,6 @@ struct dx_tail {
__le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
};
-static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
-static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash(struct dx_entry *entry);
-static void dx_set_hash(struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count(struct dx_entry *entries);
-static unsigned dx_get_limit(struct dx_entry *entries);
-static void dx_set_count(struct dx_entry *entries, unsigned value);
-static void dx_set_limit(struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct ext4_filename *fname,
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame);
-static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct buffer_head *bh,
- struct dx_hash_info *hinfo,
- struct dx_map_entry *map_tail);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
- char *to, struct dx_map_entry *offsets,
- int count, unsigned int blocksize);
-static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
- unsigned int blocksize);
-static void dx_insert_block(struct dx_frame *frame,
- u32 hash, ext4_lblk_t block);
-static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
@@ -398,7 +368,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, bh);
@@ -419,7 +389,7 @@ static void ext4_dirblock_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, bh);
@@ -494,7 +464,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -523,7 +493,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -612,7 +582,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
ext4_dir_rec_len(1, NULL) -
ext4_dir_rec_len(2, NULL) - infosize;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -622,7 +592,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
unsigned int entry_space = dir->i_sb->s_blocksize -
ext4_dir_rec_len(0, dir);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -1076,7 +1046,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
- int csum = ext4_has_metadata_csum(dir->i_sb);
+ int csum = ext4_has_feature_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1320,7 +1290,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info h = *hinfo;
int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
while ((char *) de < base + buflen) {
@@ -1462,7 +1432,8 @@ static bool ext4_match(struct inode *parent,
* sure cf_name was properly initialized before
* considering the calculated hash.
*/
- if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ if (sb_no_casefold_compat_fallback(parent->i_sb) &&
+ IS_ENCRYPTED(parent) && fname->cf_name.name &&
(fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
return false;
@@ -1595,10 +1566,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR)
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
+ else if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ *res_dir == NULL && IS_CASEFOLDED(dir))
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold "
+ "failed, falling back\n"));
+ else
goto cleanup_and_exit;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -1945,7 +1921,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
@@ -2060,8 +2036,7 @@ out:
return ERR_PTR(err);
}
-int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
@@ -2143,11 +2118,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
int csum_size = 0;
int err, err2;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de);
if (err)
return err;
@@ -2252,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct fake_dirent *fde;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@@ -2396,7 +2371,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@@ -2427,7 +2402,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
/* Can we just ignore htree data? */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
EXT4_ERROR_INODE(dir,
"Directory has corrupted htree index.");
retval = -EFSCORRUPTED;
@@ -2577,8 +2552,10 @@ again:
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, frame->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
@@ -2589,8 +2566,10 @@ again:
err = ext4_journal_get_write_access(handle, sb,
(frame - 1)->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
memcpy((char *) entries2, (char *) (entries + icount1),
icount2 * sizeof(struct dx_entry));
@@ -2609,8 +2588,10 @@ again:
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
@@ -2635,8 +2616,10 @@ again:
"Creating %d level index...\n",
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
@@ -2733,7 +2716,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@@ -2973,7 +2956,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -3004,19 +2987,19 @@ out:
return err;
}
-static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ return ERR_PTR(-EMLINK);
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -3066,7 +3049,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return ERR_PTR(err);
}
/*
@@ -3151,8 +3134,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
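Each of these entry points now follows the same pattern: ext4_emergency_state() returns an errno that is propagated directly instead of the hard-coded -EIO on forced shutdown. The helper itself is not part of the hunks shown here; judging from how callers use the return value and from the EXT4_FLAGS_EMERGENCY_RO flag introduced in super.c below, a plausible sketch (assumed, not taken from this patch) is:

	static inline int ext4_emergency_state(struct super_block *sb)
	{
		if (unlikely(ext4_forced_shutdown(sb)))
			return -EIO;	/* assumed mapping */
		if (unlikely(ext4_emergency_ro(sb)))
			return -EROFS;	/* assumed mapping */
		return 0;
	}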
@@ -3309,8 +3293,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
trace_ext4_unlink_enter(dir, dentry);
/*
@@ -3376,8 +3361,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct fscrypt_str disk_link;
int retries = 0;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(dir->i_sb);
+ if (unlikely(err))
+ return err;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
&disk_link);
@@ -4199,8 +4185,9 @@ static int ext4_rename2(struct mnt_idmap *idmap,
{
int err;
- if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(old_dir->i_sb);
+ if (unlikely(err))
+ return err;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index e5b47dda3317..c66e0cb29bd4 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -537,7 +537,7 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
ot = ext4_orphan_block_tail(sb, bh);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 69b8a7221a2b..179e54f3a3b6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
}
/*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
handle_t *handle = io_end->handle;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io_end->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
- if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
- ext4_msg(inode->i_sb, KERN_EMERG,
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
+ if (ret < 0 && !ext4_emergency_state(sb) &&
+ io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ ext4_msg(sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
+
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
#endif
}
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+ return true;
+ if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+ io_end->flag & EXT4_IO_END_FAILED)
+ return true;
+ return false;
+}
+
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
@@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- /* Only reserved conversions from writeback should enter here */
- WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle && sbi->s_journal);
+ /* Only reserved conversions or pending IO errors will enter here. */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !io_end->handle && sbi->s_journal);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
@@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
while (!list_empty(&unwritten)) {
io_end = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
list_del_init(&io_end->list);
err = ext4_end_io_end(io_end);
@@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
}
/*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
@@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
- list_empty(&io_end->list_vec)) {
- ext4_release_io_end(io_end);
+ if (io_end->flag & EXT4_IO_END_FAILED ||
+ (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))) {
+ ext4_add_complete_io(io_end);
return;
}
- ext4_add_complete_io(io_end);
+ ext4_release_io_end(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
- int err = 0;
-
if (refcount_dec_and_test(&io_end->count)) {
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_io_end_vec(io_end->handle,
- io_end);
- io_end->handle = NULL;
- ext4_clear_io_unwritten_flag(io_end);
- }
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_end_io_end(io_end);
+
ext4_release_io_end(io_end);
}
- return err;
+ return 0;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
@@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (ext4_io_end_defer_completion(io_end)) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each
@@ -522,7 +549,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
if (io->io_bio)
gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
retry_encrypt:
- bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
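To make the new completion routing in the page-io.c hunks above easier to follow, here is a standalone userspace model of the decision; names and flag values are illustrative stand-ins, only the logic mirrors ext4_put_io_end_defer() and ext4_io_end_defer_completion():

	/* Model (not kernel code) of the reworked io_end routing. */
	#include <stdbool.h>
	#include <stdio.h>

	#define IO_END_UNWRITTEN 0x1	/* extents still need conversion */
	#define IO_END_FAILED    0x2	/* bio completed with an error */

	/* Failed IO, or unwritten extents with queued io vectors, must be
	 * handed to the per-inode workqueue; anything else is released now. */
	static bool must_defer(unsigned int flags, bool has_vecs)
	{
		if (flags & IO_END_FAILED)
			return true;
		if ((flags & IO_END_UNWRITTEN) && has_vecs)
			return true;
		return false;
	}

	int main(void)
	{
		printf("failed write            -> defer=%d\n", must_defer(IO_END_FAILED, false));
		printf("unwritten extents + vec -> defer=%d\n", must_defer(IO_END_UNWRITTEN, true));
		printf("ordinary overwrite      -> defer=%d\n", must_defer(0, false));
		return 0;
	}

Run, it prints defer=1 for the failed and unwritten cases and defer=0 for a plain overwrite, which is exactly the split between ext4_add_complete_io() and ext4_release_io_end() introduced above.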
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 72f77f78ae8d..b7ff0d955f0d 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1118,7 +1118,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
struct ext4_super_block *es = (struct ext4_super_block *) data;
es->s_block_group_nr = cpu_to_le16(group);
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(sb, es);
}
@@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a50e5c31b937..8122d4ffb3b5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
@@ -302,7 +301,7 @@ __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return es->s_checksum == ext4_superblock_csum(sb, es);
@@ -312,7 +311,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
es->s_checksum = ext4_superblock_csum(sb, es);
@@ -448,9 +447,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
-#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
-
/*
* The ext4_maybe_update_superblock() function checks and updates the
* superblock if needed.
@@ -458,8 +454,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
* This function is designed to update the on-disk superblock only under
* certain conditions to prevent excessive disk writes and unnecessary
* waking of the disk from sleep. The superblock will be updated if:
- * 1. More than an hour has passed since the last superblock update, and
- * 2. More than 16MB have been written since the last superblock update.
+ * 1. More than sbi->s_sb_update_sec (default: 1 hour) has passed since the
+ *    last superblock update, and
+ * 2. More than sbi->s_sb_update_kb kilobytes (default: 16MB) have been
+ *    written since the last superblock update.
*
* @sb: The superblock
*/
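For example, with the defaults kept (the removed EXT4_SB_REFRESH_INTERVAL_* constants, 3600 seconds and 16384 KB, now seed s_sb_update_sec and s_sb_update_kb), a journal commit 90 minutes after the last superblock write that has only seen 4 MB written since then still returns without scheduling the update, since both thresholds must be exceeded; the two fields are also exported as the sb_update_sec and sb_update_kb sysfs attributes later in this series.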
@@ -473,14 +471,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
__u64 lifetime_write_kbytes;
__u64 diff_size;
- if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
- !journal || (journal->j_flags & JBD2_UNMOUNT))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !(sb->s_flags & SB_ACTIVE) || !journal ||
+ journal->j_flags & JBD2_UNMOUNT)
return;
now = ktime_get_real_seconds();
last_update = ext4_get_tstamp(es, s_wtime);
- if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
+ if (likely(now - last_update < sbi->s_sb_update_sec))
return;
lifetime_write_kbytes = sbi->s_kbytes_written +
@@ -495,32 +494,18 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
*/
diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
- if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
+ if (diff_size > sbi->s_sb_update_kb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
ext4_maybe_update_superblock(sb);
-
- spin_lock(&sbi->s_md_lock);
- while (!list_empty(&txn->t_private_list)) {
- jce = list_entry(txn->t_private_list.next,
- struct ext4_journal_cb_entry, jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- jce->jce_func(sb, jce, error);
- spin_lock(&sbi->s_md_lock);
- }
- spin_unlock(&sbi->s_md_lock);
}
/*
@@ -707,11 +692,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
- if (!continue_fs && !sb_rdonly(sb)) {
- set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
- if (journal)
- jbd2_journal_abort(journal, -EIO);
- }
+ if (!continue_fs && !ext4_emergency_ro(sb) && journal)
+ jbd2_journal_abort(journal, -EIO);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, error, ino, block, func, line);
@@ -719,9 +701,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* In case the fs should keep running, we need to writeout
* superblock through the journal. Due to lock ordering
* constraints, it may not be safe to do it right here so we
- * defer superblock flushing to a workqueue.
+ * defer superblock flushing to a workqueue. We just need to be
+ * careful when the journal is already shutting down. If we get
+ * here in that case, just update the sb directly as the last
+ * transaction won't commit anyway.
*/
- if (continue_fs && journal)
+ if (continue_fs && journal &&
+ !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
@@ -737,17 +723,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
sb->s_id);
}
- if (sb_rdonly(sb) || continue_fs)
+ if (ext4_emergency_ro(sb) || continue_fs)
return;
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
- * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
- * modifications. We don't set SB_RDONLY because that requires
- * sb->s_umount semaphore and setting it without proper remount
- * procedure is confusing code such as freeze_super() leading to
- * deadlocks and other problems.
+ * We don't set SB_RDONLY because that requires sb->s_umount
+ * semaphore and setting it without proper remount procedure is
+ * confusing code such as freeze_super() leading to deadlocks
+ * and other problems.
*/
+ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}
static void update_super_work(struct work_struct *work)
@@ -765,7 +751,8 @@ static void update_super_work(struct work_struct *work)
* We use directly jbd2 functions here to avoid recursing back into
* ext4 error handling code during handling of previous errors.
*/
- if (!sb_rdonly(sbi->s_sb) && journal) {
+ if (!ext4_emergency_state(sbi->s_sb) &&
+ !sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
bool call_notify_err = false;
@@ -819,7 +806,7 @@ void __ext4_error(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -844,7 +831,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -879,7 +866,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -959,7 +946,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@@ -1053,7 +1040,7 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -1306,18 +1293,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
- flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
- err = jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ err = ext4_journal_destroy(sbi, sbi->s_journal);
if ((err < 0) && !aborted) {
ext4_abort(sb, -err, "Couldn't clean up the journal");
}
- }
+ } else
+ flush_work(&sbi->s_sb_upd_work);
ext4_es_unregister_shrinker(sbi);
timer_shutdown_sync(&sbi->s_err_report);
@@ -1325,13 +1311,14 @@ static void ext4_put_super(struct super_block *sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
- if (!sb_rdonly(sb) && !aborted) {
- ext4_clear_feature_journal_needs_recovery(sb);
- ext4_clear_feature_orphan_present(sb);
- es->s_state = cpu_to_le16(sbi->s_mount_state);
- }
- if (!sb_rdonly(sb))
+ if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
+ if (!aborted) {
+ ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ }
ext4_commit_super(sb);
+ }
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
@@ -1426,7 +1413,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
@@ -2785,6 +2771,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+ if (ext4_emergency_ro(sb))
+ SEQ_OPTS_PUTS("emergency_ro");
+
+ if (ext4_forced_shutdown(sb))
+ SEQ_OPTS_PUTS("shutdown");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3205,7 +3204,7 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sbi->s_sb)) {
+ if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__u32 csum32;
__u16 dummy_csum = 0;
@@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
- !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ !ext4_emergency_state(sb) && !sb_rdonly(sb) &&
+ test_opt(sb, INIT_INODE_TABLE)) {
elr->lr_next_group = elr->lr_first_not_zeroed;
elr->lr_mode = EXT4_LI_MODE_ITABLE;
ret = 0;
@@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (sb_rdonly(sb) ||
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
(test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
(first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
@@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb,
if (ext4_has_feature_fast_commit(sb))
set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
@@ -4642,7 +4642,8 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+ else if (ext4_has_feature_metadata_csum(sb) ||
+ ext4_has_feature_ea_inode(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
@@ -4973,10 +4974,7 @@ static int ext4_load_and_init_journal(struct super_block *sb,
return 0;
out:
- /* flush s_sb_upd_work before destroying the journal. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
return -EINVAL;
}
@@ -5013,6 +5011,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb)
return 0;
}
+static const char *ext4_has_journal_option(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ return "journal_async_commit";
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
+ return "journal_checksum";
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ return "commit=";
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt))
+ return "data=";
+ if (test_opt(sb, DATA_ERR_ABORT))
+ return "data_err=abort";
+ return NULL;
+}
+
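With this helper, the no-journal checks below collapse into a single report of the form "can't mount with %s, fs mounted w/o journal", where %s is whichever option string the helper returned (for example "journal_async_commit" or the newly rejected "data_err=abort").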
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
int silent)
{
@@ -5263,6 +5279,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
+ sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
/*
* set default s_li_wait_mult for lazyinit, for the case there is
@@ -5404,30 +5422,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
"suppressed and not mounted read-only");
goto failed_mount3a;
} else {
+ const char *journal_option;
+
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
+ journal_option = ext4_has_journal_option(sb);
+ if (journal_option != NULL) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with %s, fs mounted w/o journal",
+ journal_option);
goto failed_mount3a;
}
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount3a;
- }
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "commit=%lu, fs mounted w/o journal",
- sbi->s_commit_interval / HZ);
- goto failed_mount3a;
- }
- if (EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "data=, fs mounted w/o journal");
- goto failed_mount3a;
- }
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@@ -5616,9 +5621,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount9;
}
- if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
ext4_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but the device does not support discard");
+ clear_opt(sb, DISCARD);
+ }
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5665,10 +5672,7 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- /* flush s_sb_upd_work before journal destroy. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
@@ -5773,10 +5777,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- if (test_opt(sb, DATA_ERR_ABORT))
- journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
- else
- journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
/*
* Always enable journal cycle record option, letting the journal
* records log transactions continuously between each mount.
@@ -5973,7 +5973,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
return journal;
out_journal:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
out_bdev:
bdev_fput(bdev_file);
return ERR_PTR(errno);
@@ -6090,8 +6090,7 @@ static int ext4_load_journal(struct super_block *sb,
EXT4_SB(sb)->s_journal = journal;
err = ext4_clear_journal_err(sb, es);
if (err) {
- EXT4_SB(sb)->s_journal = NULL;
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6109,7 +6108,7 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
err_out:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6336,8 +6335,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6419,7 +6419,7 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
@@ -6575,7 +6575,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (ext4_forced_shutdown(sb)) {
+ if (ext4_emergency_state(sb)) {
err = -EROFS;
goto restore_opts;
}
@@ -6780,6 +6780,7 @@ static int ext4_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int ret;
+ bool old_ro = sb_rdonly(sb);
fc->s_fs_info = EXT4_SB(sb);
@@ -6791,9 +6792,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
- &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
+ &sb->s_uuid,
+ (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
return 0;
}
@@ -6817,22 +6818,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
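As a concrete illustration of the new clamping: with a project block limit of 1000 blocks and 1200 blocks already charged (curspace plus rsvspace), remaining stays 0, so f_bfree and f_bavail are reported as 0, and min() guarantees the quota-derived figures can never exceed what the filesystem itself reported; the old code only adjusted the numbers when f_blocks happened to exceed the limit.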
@@ -6935,12 +6943,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@@ -6951,6 +6972,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
@@ -7288,7 +7313,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -7381,12 +7406,9 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-/* Shared across all ext4 file systems */
-wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
static int __init ext4_init_fs(void)
{
- int i, err;
+ int err;
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
@@ -7394,9 +7416,6 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ext4__ioend_wq[i]);
-
err = ext4_init_es();
if (err)
return err;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index ddb54608ca2e..987bd00f916a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
+EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
+EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
ATTR_LIST(last_trim_minblks),
+ ATTR_LIST(sb_update_sec),
+ ATTR_LIST(sb_update_kb),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7647e9f6e190..7ab8f2e8e815 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
struct ext4_xattr_header *hdr = BHDR(bh);
int ret = 1;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
lock_buffer(bh);
ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
bh->b_blocknr, hdr));
@@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
static void ext4_xattr_block_csum_set(struct inode *inode,
struct buffer_head *bh)
{
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
bh->b_blocknr, BHDR(bh));
}
@@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
-static inline int
+int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
@@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
function, line);
}
-#define xattr_check_inode(inode, header, end) \
- __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
void *end, int name_index, const char *name, int sorted)
@@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
+ end = ITAIL(inode, raw_inode);
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
@@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
- void *end;
int error;
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
-cleanup:
brelse(iloc.bh);
return error;
}
@@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
qsize_t ea_inode_refs = 0;
- void *end;
int ret;
lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- ret = xattr_check_inode(inode, header, end);
- if (ret)
- goto out;
for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+ if (block_csum)
+ end = (void *)bh->b_data + bh->b_size;
+ else {
+ ext4_get_inode_loc(parent, &iloc);
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
- is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ is->s.end = ITAIL(inode, raw_inode);
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = xattr_check_inode(inode, header, is->s.end);
- if (error)
- return error;
/* Find the named attribute. */
error = xattr_find_entry(inode, &is->s.here, is->s.end,
i->name_index, i->name, 0);
@@ -2786,14 +2775,10 @@ retry:
*/
base = IFIRST(header);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ end = ITAIL(inode, raw_inode);
min_offs = end - base;
total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
-
ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
if (ifree >= isize_diff)
goto shift;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b25c2d7b5f99..1fedf44d4fb6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -67,6 +67,9 @@ struct ext4_xattr_entry {
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
+#define ITAIL(inode, raw_inode) \
+ ((void *)(raw_inode) + \
+ EXT4_SB((inode)->i_sb)->s_inode_size)
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
@@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
+extern int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line);
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
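The ITAIL() macro added above marks the hard end of the in-inode extended attribute area (the end of the on-disk inode), which callers now pass as the walk bound instead of re-validating the whole header on every lookup. A minimal sketch of the intended usage, modelled on the bounded loop added to ext4_xattr_inode_dec_ref_all():

	struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
	void *end = ITAIL(inode, raw_inode);
	struct ext4_xattr_entry *entry;

	for (entry = IFIRST(header);
	     (void *)entry < end && !IS_LAST_ENTRY(entry);
	     entry = EXT4_XATTR_NEXT(entry)) {
		/* every entry examined starts inside the inode body */
	}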
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index efda9a022981..cf77987d0698 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -21,7 +21,7 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
-#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
@@ -58,7 +58,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
- struct page *page;
+ struct folio *folio;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
@@ -74,37 +74,37 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
if (unlikely(!is_meta))
fio.op_flags &= ~REQ_META;
repeat:
- page = f2fs_grab_cache_page(mapping, index, false);
- if (!page) {
+ folio = f2fs_grab_cache_folio(mapping, index, false);
+ if (IS_ERR(folio)) {
cond_resched();
goto repeat;
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto out;
- fio.page = page;
+ fio.page = &folio->page;
err = f2fs_submit_page_bio(&fio);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);
- lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping)) {
+ f2fs_folio_put(folio, true);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
- f2fs_handle_page_eio(sbi, page_folio(page), META);
- f2fs_put_page(page, 1);
+ if (unlikely(!folio_test_uptodate(folio))) {
+ f2fs_handle_page_eio(sbi, folio, META);
+ f2fs_folio_put(folio, true);
return ERR_PTR(-EIO);
}
out:
- return page;
+ return &folio->page;
}
struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
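The conversion in this hunk follows the page-to-folio substitutions used throughout the f2fs changes in this section: f2fs_grab_cache_page() becomes f2fs_grab_cache_folio() (which returns an ERR_PTR instead of NULL on failure), PageUptodate()/SetPageUptodate() become folio_test_uptodate()/folio_mark_uptodate(), lock_page() becomes folio_lock(), and f2fs_put_page(page, 1) becomes f2fs_folio_put(folio, true).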
@@ -381,12 +381,6 @@ redirty_out:
return AOP_WRITEPAGE_ACTIVATE;
}
-static int f2fs_write_meta_page(struct page *page,
- struct writeback_control *wbc)
-{
- return __f2fs_write_meta_page(page, wbc, FS_META_IO);
-}
-
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -507,7 +501,6 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,
}
const struct address_space_operations f2fs_meta_aops = {
- .writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
.dirty_folio = f2fs_dirty_meta_folio,
.invalidate_folio = f2fs_invalidate_folio,
@@ -1237,7 +1230,7 @@ static int block_operations(struct f2fs_sb_info *sbi)
retry_flush_quotas:
f2fs_lock_all(sbi);
if (__need_flush_quota(sbi)) {
- int locked;
+ bool need_lock = sbi->umount_lock_holder != current;
if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
@@ -1246,11 +1239,13 @@ retry_flush_quotas:
}
f2fs_unlock_all(sbi);
- /* only failed during mount/umount/freeze/quotactl */
- locked = down_read_trylock(&sbi->sb->s_umount);
- f2fs_quota_sync(sbi->sb, -1);
- if (locked)
+ /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */
+ if (!need_lock) {
+ f2fs_do_quota_sync(sbi->sb, -1);
+ } else if (down_read_trylock(&sbi->sb->s_umount)) {
+ f2fs_do_quota_sync(sbi->sb, -1);
up_read(&sbi->sb->s_umount);
+ }
cond_resched();
goto retry_flush_quotas;
}
@@ -1344,21 +1339,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long flags;
- if (cpc->reason & CP_UMOUNT) {
- if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
- NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
- clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Disable nat_bits due to no space");
- } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
- f2fs_nat_bitmap_enabled(sbi)) {
- f2fs_enable_nat_bits(sbi);
- set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Rebuild and enable nat_bits");
- }
- }
-
spin_lock_irqsave(&sbi->cp_lock, flags);
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
+ sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
+ disable_nat_bits(sbi, false);
+
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
else
@@ -1541,8 +1528,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
/* write nat bits */
- if ((cpc->reason & CP_UMOUNT) &&
- is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
+ if (enabled_nat_bits(sbi, cpc)) {
__u64 cp_ver = cur_cp_version(ckpt);
block_t blk;
@@ -1867,7 +1853,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
struct cp_control cpc;
cpc.reason = __get_cp_reason(sbi);
- if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
+ if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC ||
+ sbi->umount_lock_holder == current) {
int ret;
f2fs_down_write(&sbi->gc_lock);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 985690d81a82..9b94810675c1 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1150,6 +1150,7 @@ retry:
f2fs_compress_ctx_add_page(cc, page_folio(page));
if (!PageUptodate(page)) {
+ f2fs_handle_page_eio(sbi, page_folio(page), DATA);
release_and_retry:
f2fs_put_rpages(cc);
f2fs_unlock_rpages(cc, i + 1);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index de4da6d9cd93..54f89f0ee69b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -319,8 +319,7 @@ static void f2fs_read_end_io(struct bio *bio)
static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
iostat_update_and_unbind_ctx(bio);
sbi = bio->bi_private;
@@ -328,34 +327,41 @@ static void f2fs_write_end_io(struct bio *bio)
if (time_to_inject(sbi, FAULT_WRITE_IO))
bio->bi_status = BLK_STS_IOERR;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- enum count_type type = WB_DATA_TYPE(page, false);
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ enum count_type type;
+
+ if (fscrypt_is_bounce_folio(folio)) {
+ struct folio *io_folio = folio;
- fscrypt_finalize_bounce_page(&page);
+ folio = fscrypt_pagecache_folio(io_folio);
+ fscrypt_free_bounce_page(&io_folio->page);
+ }
#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_is_compressed_page(page)) {
- f2fs_compress_write_end_io(bio, page);
+ if (f2fs_is_compressed_page(&folio->page)) {
+ f2fs_compress_write_end_io(bio, &folio->page);
continue;
}
#endif
+ type = WB_DATA_TYPE(&folio->page, false);
+
if (unlikely(bio->bi_status)) {
- mapping_set_error(page->mapping, -EIO);
+ mapping_set_error(folio->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
f2fs_stop_checkpoint(sbi, true,
STOP_CP_REASON_WRITE_FAIL);
}
- f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
- page_folio(page)->index != nid_of_node(page));
+ f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) &&
+ folio->index != nid_of_node(&folio->page));
dec_page_count(sbi, type);
- if (f2fs_in_warm_node_list(sbi, page))
- f2fs_del_fsync_node_entry(sbi, page);
- clear_page_private_gcing(page);
- end_page_writeback(page);
+ if (f2fs_in_warm_node_list(sbi, folio))
+ f2fs_del_fsync_node_entry(sbi, &folio->page);
+ clear_page_private_gcing(&folio->page);
+ folio_end_writeback(folio);
}
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
@@ -413,6 +419,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
{
unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0);
+ struct folio *fio_folio = page_folio(fio->page);
unsigned int fua_flag, meta_flag, io_flag;
blk_opf_t op_flags = 0;
@@ -438,6 +445,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
op_flags |= REQ_META;
if (BIT(fio->temp) & fua_flag)
op_flags |= REQ_FUA;
+
+ if (fio->type == DATA &&
+ F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE)
+ op_flags |= REQ_PRIO;
+
return op_flags;
}
@@ -876,6 +888,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
struct bio *bio = *fio->bio;
struct page *page = fio->encrypted_page ?
fio->encrypted_page : fio->page;
+ struct folio *folio = page_folio(fio->page);
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
@@ -889,8 +902,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
- f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
- page_folio(fio->page)->index, fio, GFP_NOIO);
+ f2fs_set_bio_crypt_ctx(bio, folio->mapping->host,
+ folio->index, fio, GFP_NOIO);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
@@ -899,8 +912,7 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
- PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio));
inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
@@ -1041,8 +1053,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages),
REQ_OP_READ | op_flag,
for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
- if (!bio)
- return ERR_PTR(-ENOMEM);
bio->bi_iter.bi_sector = sector;
f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
bio->bi_end_io = f2fs_read_end_io;
@@ -1193,18 +1203,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write,
- pgoff_t *next_pgofs)
+struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
- struct page *page;
+ struct folio *folio;
int err;
- page = f2fs_grab_cache_page(mapping, index, for_write);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = f2fs_grab_cache_folio(mapping, index, for_write);
+ if (IS_ERR(folio))
+ return folio;
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
@@ -1239,9 +1248,9 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
goto put_err;
}
got_it:
- if (PageUptodate(page)) {
- unlock_page(page);
- return page;
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ return folio;
}
/*
@@ -1252,48 +1261,51 @@ got_it:
* f2fs_init_inode_metadata.
*/
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_SIZE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- unlock_page(page);
- return page;
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return folio;
}
- err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr,
+ err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
op_flags, for_write);
if (err)
goto put_err;
- return page;
+ return folio;
put_err:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index,
pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
- page = find_get_page_flags(mapping, index, FGP_ACCESSED);
- if (page && PageUptodate(page))
- return page;
- f2fs_put_page(page, 0);
+ folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
+ goto read;
+ if (folio_test_uptodate(folio))
+ return folio;
+ f2fs_folio_put(folio, false);
- page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs);
- if (IS_ERR(page))
- return page;
+read:
+ folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs);
+ if (IS_ERR(folio))
+ return folio;
- if (PageUptodate(page))
- return page;
+ if (folio_test_uptodate(folio))
+ return folio;
- wait_on_page_locked(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 0);
+ folio_wait_locked(folio);
+ if (unlikely(!folio_test_uptodate(folio))) {
+ f2fs_folio_put(folio, false);
return ERR_PTR(-EIO);
}
- return page;
+ return folio;
}
/*
@@ -1301,23 +1313,23 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index,
bool for_write)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
- page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
- if (IS_ERR(page))
- return page;
+ folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL);
+ if (IS_ERR(folio))
+ return folio;
/* wait for read completion */
- lock_page(page);
- if (unlikely(page->mapping != mapping || !PageUptodate(page))) {
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) {
+ f2fs_folio_put(folio, true);
return ERR_PTR(-EIO);
}
- return page;
+ return folio;
}
/*
@@ -2178,6 +2190,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
int i;
int ret = 0;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
+ from_dnode = false;
+ goto out_put_dnode;
+ }
+
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) +
@@ -2221,10 +2239,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (ret)
goto out;
- if (unlikely(f2fs_cp_error(sbi))) {
- ret = -EIO;
- goto out_put_dnode;
- }
f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
skip_reading_dnode:
@@ -2500,7 +2514,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
return 0;
retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
+ fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page),
PAGE_SIZE, 0, gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
/* flush pending IOs and wait for a while in the ENOMEM case */
@@ -2921,29 +2935,6 @@ redirty_out:
return err;
}
-static int f2fs_write_data_page(struct page *page,
- struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- struct inode *inode = folio->mapping->host;
-
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
- goto out;
-
- if (f2fs_compressed_file(inode)) {
- if (f2fs_is_compressed_cluster(inode, folio->index)) {
- folio_redirty_for_writepage(wbc, folio);
- return AOP_WRITEPAGE_ACTIVATE;
- }
- }
-out:
-#endif
-
- return f2fs_write_single_data_page(folio, NULL, NULL, NULL,
- wbc, FS_DATA_IO, 0, true);
-}
-
/*
* This function was copied from write_cache_pages from mm/page-writeback.c.
* The major change is making write step of cold data page separately from
@@ -3266,10 +3257,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
int ret;
bool locked = false;
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
/* skip writing if there is no dirty page in this inode */
if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
return 0;
@@ -3390,7 +3377,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
restart:
/* check inline_data */
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto unlock_out;
@@ -3453,7 +3440,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
struct page *ipage;
int err = 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -3483,7 +3470,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto unlock_out;
@@ -4101,7 +4088,6 @@ static void f2fs_swap_deactivate(struct file *file)
const struct address_space_operations f2fs_dblock_aops = {
.read_folio = f2fs_read_data_folio,
.readahead = f2fs_readahead,
- .writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
@@ -4195,7 +4181,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
- if (flags & IOMAP_WRITE)
+
+ /*
+ * If the blocks being overwritten are already allocated,
+ * f2fs_map_lock and f2fs_balance_fs are not necessary.
+ */
+ if ((flags & IOMAP_WRITE) &&
+ !f2fs_overwrite_io(inode, offset, length))
map.m_may_create = true;
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 468828288a4a..16c2dfb4f595 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->ndonate_files = sbi->donate_files;
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->aw_cnt = atomic_read(&sbi->atomic_files);
@@ -501,6 +502,8 @@ static int stat_show(struct seq_file *s, void *v)
si->compr_inode, si->compr_blocks);
seq_printf(s, " - Swapfile Inode: %u\n",
si->swapfile_inode);
+ seq_printf(s, " - Donate Inode: %u\n",
+ si->ndonate_files);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 54dd52de7269..5a63ff0df03b 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -551,7 +551,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
}
} else {
- page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino);
+ page = f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
return page;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1afa7be16e7d..f1576dc6ec67 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -62,6 +62,7 @@ enum {
FAULT_BLKADDR_VALIDITY,
FAULT_BLKADDR_CONSISTENCE,
FAULT_NO_SEGMENT,
+ FAULT_INCONSISTENT_FOOTER,
FAULT_MAX,
};
@@ -114,6 +115,13 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_GC_MERGE 0x02000000
#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000
#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000
+#define F2FS_MOUNT_NAT_BITS 0x10000000
+#define F2FS_MOUNT_INLINECRYPT 0x20000000
+/*
+ * Some f2fs environments expect to be able to pass the "lazytime" option
+ * string rather than using the MS_LAZYTIME flag, so this must remain.
+ */
+#define F2FS_MOUNT_LAZYTIME 0x40000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -830,6 +838,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
+ unsigned int ioprio_hint; /* hint for IO priority */
struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
@@ -849,6 +858,11 @@ struct f2fs_inode_info {
#endif
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
+
+ /* linked in global inode list for cache donation */
+ struct list_head gdonate_list;
+ pgoff_t donate_start, donate_end; /* inclusive */
+
struct task_struct *atomic_write_task; /* store atomic write task */
struct extent_tree *extent_tree[NR_EXTENT_CACHES];
/* cached extent_tree entry */
@@ -1273,6 +1287,7 @@ enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
DIRTY_META, /* for all dirtied inode metadata */
+ DONATE_INODE, /* for all inodes donating pages */
NR_INODE_TYPE,
};
@@ -1628,6 +1643,9 @@ struct f2fs_sb_info {
unsigned int warm_data_age_threshold;
unsigned int last_age_weight;
+ /* # of inodes donating page cache */
+ unsigned int donate_files;
+
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
@@ -1659,6 +1677,7 @@ struct f2fs_sb_info {
unsigned int nquota_files; /* # of quota sysfile */
struct f2fs_rwsem quota_sem; /* blocking cp for flags */
+ struct task_struct *umount_lock_holder; /* s_umount lock holder */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
@@ -1800,6 +1819,9 @@ struct f2fs_sb_info {
u64 committed_atomic_block;
u64 revoked_atomic_block;
+ /* carve out reserved_blocks from total blocks */
+ bool carve_out;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -2015,7 +2037,7 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
return (struct f2fs_checkpoint *)(sbi->ckpt);
}
-static inline struct f2fs_node *F2FS_NODE(struct page *page)
+static inline struct f2fs_node *F2FS_NODE(const struct page *page)
{
return (struct f2fs_node *)page_address(page);
}
@@ -2219,6 +2241,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
#endif
}
+static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
+{
+ unsigned long flags;
+ unsigned char *nat_bits;
+
+ /*
+ * Re-enabling nat_bits would require an fsck.f2fs run triggered via
+ * set_sbi_flag(sbi, SBI_NEED_FSCK), but that may incur a huge cost,
+ * so let's rely on a regular fsck or an unclean shutdown instead.
+ */
+
+ if (lock)
+ spin_lock_irqsave(&sbi->cp_lock, flags);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
+ nat_bits = NM_I(sbi)->nat_bits;
+ NM_I(sbi)->nat_bits = NULL;
+ if (lock)
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
+
+ kvfree(nat_bits);
+}
+
+static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
+{
+ bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
+}
+
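
A minimal sketch of how the two helpers above are intended to be called (illustrative only, mirroring the callers converted later in this diff; sbi and cpc are placeholders): passing NULL asks only whether CP_NAT_BITS_FLAG is still set, while passing a cp_control additionally requires a CP_UMOUNT checkpoint.

	/* flag-only check, as in load_free_nid_bitmap() / __get_nat_bitmaps() */
	if (!enabled_nat_bits(sbi, NULL))
		return 0;

	/* umount-time check, as in f2fs_flush_nat_entries() */
	if (enabled_nat_bits(sbi, cpc))
		remove_nats_in_journal(sbi);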
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
f2fs_down_read(&sbi->cp_rwsem);
@@ -2765,33 +2817,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
return percpu_counter_sum_positive(&sbi->total_valid_inode_count);
}
-static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
- pgoff_t index, bool for_write)
+static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping,
+ pgoff_t index, bool for_write)
{
- struct page *page;
+ struct folio *folio;
unsigned int flags;
if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) {
+ fgf_t fgf_flags;
+
if (!for_write)
- page = find_get_page_flags(mapping, index,
- FGP_LOCK | FGP_ACCESSED);
+ fgf_flags = FGP_LOCK | FGP_ACCESSED;
else
- page = find_lock_page(mapping, index);
- if (page)
- return page;
+ fgf_flags = FGP_LOCK;
+ folio = __filemap_get_folio(mapping, index, fgf_flags, 0);
+ if (!IS_ERR(folio))
+ return folio;
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
if (!for_write)
- return grab_cache_page(mapping, index);
+ return filemap_grab_folio(mapping, index);
flags = memalloc_nofs_save();
- page = grab_cache_page_write_begin(mapping, index);
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
memalloc_nofs_restore(flags);
- return page;
+ return folio;
+}
+
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+ pgoff_t index, bool for_write)
+{
+ struct folio *folio = f2fs_grab_cache_folio(mapping, index, for_write);
+
+ if (IS_ERR(folio))
+ return NULL;
+ return &folio->page;
}
static inline struct page *f2fs_pagecache_get_page(
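
A short sketch of the resulting calling conventions (illustrative only; mapping and index are placeholders): the new folio helper reports failure with ERR_PTR(), while the legacy page wrapper keeps its old NULL-on-failure contract.

	/* folio-first caller: errors are returned as ERR_PTR() */
	struct folio *folio = f2fs_grab_cache_folio(mapping, index, false);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* legacy caller: NULL still means lookup/allocation failure */
	struct page *page = f2fs_grab_cache_page(mapping, index, false);
	if (!page)
		return -ENOMEM;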
@@ -2804,16 +2869,23 @@ static inline struct page *f2fs_pagecache_get_page(
return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}
-static inline void f2fs_put_page(struct page *page, int unlock)
+static inline void f2fs_folio_put(struct folio *folio, bool unlock)
{
- if (!page)
+ if (!folio)
return;
if (unlock) {
- f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
- unlock_page(page);
+ f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio));
+ folio_unlock(folio);
}
- put_page(page);
+ folio_put(folio);
+}
+
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+ if (!page)
+ return;
+ f2fs_folio_put(page_folio(page), unlock);
}
static inline void f2fs_put_dnode(struct dnode_of_data *dn)
@@ -3624,7 +3696,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
int f2fs_dquot_initialize(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
-int f2fs_quota_sync(struct super_block *sb, int type);
+int f2fs_do_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag);
@@ -3647,7 +3719,8 @@ struct node_info;
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
-bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi,
+ const struct folio *folio);
void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
@@ -3662,12 +3735,14 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
int f2fs_truncate_xattr_node(struct inode *inode);
int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
unsigned int seq_id);
-bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi);
int f2fs_remove_inode_page(struct inode *inode);
struct page *f2fs_new_inode_page(struct inode *inode);
struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
+struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino);
+struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid);
struct page *f2fs_get_node_page_ra(struct page *parent, int start);
int f2fs_move_node_page(struct page *node_page, int gc_type);
void f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
@@ -3687,7 +3762,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
-void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi);
int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
@@ -3758,8 +3832,10 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_io_info *fio);
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
block_t blkaddr, unsigned int blkcnt);
-void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered, bool locked);
+void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
+ bool ordered, bool locked);
+#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \
+ f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked)
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len);
@@ -3871,11 +3947,11 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
-struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
- pgoff_t *next_pgofs);
-struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
+struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs);
+struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index,
bool for_write);
struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size);
@@ -3902,6 +3978,22 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi);
void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
extern const struct iomap_ops f2fs_iomap_ops;
+static inline struct page *f2fs_find_data_page(struct inode *inode,
+ pgoff_t index, pgoff_t *next_pgofs)
+{
+ struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs);
+
+ return &folio->page;
+}
+
+static inline struct page *f2fs_get_lock_data_page(struct inode *inode,
+ pgoff_t index, bool for_write)
+{
+ struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write);
+
+ return &folio->page;
+}
+
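
A note on error handling in the wrappers above (illustrative sketch, not part of the patch): a struct folio overlays a struct page at its start, so &folio->page has the same pointer value as the folio itself, including ERR_PTR() encodings. Existing callers can therefore keep their IS_ERR() checks unchanged:

	struct page *page = f2fs_get_lock_data_page(inode, index, true);

	if (IS_ERR(page))
		return PTR_ERR(page);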
/*
* gc.c
*/
@@ -3966,7 +4058,8 @@ struct f2fs_stat_info {
unsigned long long allocated_data_blocks;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
- unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
+ unsigned int ndirty_dirs, ndirty_files, ndirty_all;
+ unsigned int nquota_files, ndonate_files;
int nats, dirty_nats, sits, dirty_sits;
int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
@@ -4231,6 +4324,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
struct shrink_control *sc);
unsigned long f2fs_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc);
+unsigned int f2fs_donate_files(void);
+void f2fs_reclaim_caches(unsigned int reclaim_caches_kb);
void f2fs_join_shrinker(struct f2fs_sb_info *sbi);
void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f92a9fba9991..abbcbb5865a3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -707,31 +707,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
loff_t offset = from & (PAGE_SIZE - 1);
pgoff_t index = from >> PAGE_SHIFT;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
if (!offset && !cache_only)
return 0;
if (cache_only) {
- page = find_lock_page(mapping, index);
- if (page && PageUptodate(page))
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio))
+ return 0;
+ if (folio_test_uptodate(folio))
goto truncate_out;
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return 0;
}
- page = f2fs_get_lock_data_page(inode, index, true);
- if (IS_ERR(page))
- return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
+ folio = f2fs_get_lock_data_folio(inode, index, true);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio);
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA, true, true);
- zero_user(page, offset, PAGE_SIZE - offset);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
+ folio_zero_segment(folio, offset, folio_size(folio));
/* An encrypted inode should have a key and truncate the last page. */
f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
if (!cache_only)
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
return 0;
}
@@ -759,7 +761,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
if (lock)
f2fs_lock_op(sbi);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto out;
@@ -1834,18 +1836,32 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
map.m_len = sec_blks;
next_alloc:
+ f2fs_down_write(&sbi->pin_sem);
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (has_not_enough_free_secs(sbi, 0, 0)) {
+ f2fs_up_write(&sbi->pin_sem);
+ err = -ENOSPC;
+ f2fs_warn_ratelimited(sbi,
+ "ino:%lu, start:%lu, end:%lu, need to trigger GC to "
+ "reclaim enough free segment when checkpoint is enabled",
+ inode->i_ino, pg_start, pg_end);
+ goto out_err;
+ }
+ }
+
if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ?
ZONED_PIN_SEC_REQUIRED_COUNT :
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
f2fs_down_write(&sbi->gc_lock);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
- if (err && err != -ENODATA)
+ if (err && err != -ENODATA) {
+ f2fs_up_write(&sbi->pin_sem);
goto out_err;
+ }
}
- f2fs_down_write(&sbi->pin_sem);
-
err = f2fs_allocate_pinning_section(sbi);
if (err) {
f2fs_up_write(&sbi->pin_sem);
@@ -2448,6 +2464,52 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
return ret;
}
+static void f2fs_keep_noreuse_range(struct inode *inode,
+ loff_t offset, loff_t len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode));
+ u64 start, end;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ if (offset >= max_bytes || len > max_bytes ||
+ (offset + len) > max_bytes)
+ return;
+
+ start = offset >> PAGE_SHIFT;
+ end = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+ inode_lock(inode);
+ if (f2fs_is_atomic_file(inode)) {
+ inode_unlock(inode);
+ return;
+ }
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ /* remove the range if len is 0 */
+ if (!len) {
+ if (!list_empty(&F2FS_I(inode)->gdonate_list)) {
+ list_del_init(&F2FS_I(inode)->gdonate_list);
+ sbi->donate_files--;
+ }
+ } else {
+ if (list_empty(&F2FS_I(inode)->gdonate_list)) {
+ list_add_tail(&F2FS_I(inode)->gdonate_list,
+ &sbi->inode_list[DONATE_INODE]);
+ sbi->donate_files++;
+ } else {
+ list_move_tail(&F2FS_I(inode)->gdonate_list,
+ &sbi->inode_list[DONATE_INODE]);
+ }
+ F2FS_I(inode)->donate_start = start;
+ F2FS_I(inode)->donate_end = end - 1;
+ }
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ inode_unlock(inode);
+}
+
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3446,6 +3508,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg)
(u32 __user *)arg);
}
+static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u32 level;
+
+ if (get_user(level, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX)
+ return -EINVAL;
+
+ inode_lock(inode);
+ F2FS_I(inode)->ioprio_hint = level;
+ inode_unlock(inode);
+ return 0;
+}
+
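
A minimal userspace sketch of the new ioctl (illustrative only; the command macro and the accepted levels such as F2FS_IOPRIO_MAX come from the f2fs uapi header, which is not part of this hunk, and fd is assumed to refer to a regular file on f2fs):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/f2fs.h>		/* assumed to provide F2FS_IOC_IO_PRIO */

	static int set_f2fs_io_prio(int fd, __u32 level)
	{
		/* level must be below F2FS_IOPRIO_MAX */
		if (ioctl(fd, F2FS_IOC_IO_PRIO, &level) < 0) {
			perror("F2FS_IOC_IO_PRIO");
			return -1;
		}
		return 0;
	}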
int f2fs_precache_extents(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -4547,6 +4626,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_compress_file(filp);
case F2FS_IOC_GET_DEV_ALIAS_FILE:
return f2fs_ioc_get_dev_alias_file(filp, arg);
+ case F2FS_IOC_IO_PRIO:
+ return f2fs_ioc_io_prio(filp, arg);
default:
return -ENOTTY;
}
@@ -5147,12 +5228,16 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
}
err = generic_fadvise(filp, offset, len, advice);
- if (!err && advice == POSIX_FADV_DONTNEED &&
- test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) &&
- f2fs_compressed_file(inode))
- f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino);
+ if (err)
+ return err;
- return err;
+ if (advice == POSIX_FADV_DONTNEED &&
+ (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) &&
+ f2fs_compressed_file(inode)))
+ f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino);
+ else if (advice == POSIX_FADV_NOREUSE)
+ f2fs_keep_noreuse_range(inode, offset, len);
+ return 0;
}
#ifdef CONFIG_COMPAT
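
A minimal userspace sketch of how the POSIX_FADV_NOREUSE path above is reached (illustrative only; fd and the 64MB range are placeholders). A non-zero length registers the page range covering [offset, offset+len) on the donate list via f2fs_keep_noreuse_range(), and a zero length removes the file from that list again:

	#include <fcntl.h>

	static void donate_cache_range(int fd)
	{
		/* register the first 64MB as not-to-be-reused page cache */
		posix_fadvise(fd, 0, 64 << 20, POSIX_FADV_NOREUSE);

		/* a zero length later drops the registration */
		posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	}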
@@ -5261,6 +5346,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_DECOMPRESS_FILE:
case F2FS_IOC_COMPRESS_FILE:
case F2FS_IOC_GET_DEV_ALIAS_FILE:
+ case F2FS_IOC_IO_PRIO:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index faf9fa1c804d..2b8f9239bede 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1449,14 +1449,14 @@ out:
}
static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
- unsigned int segno, int off)
+ unsigned int segno, int off)
{
- struct page *page;
+ struct folio *folio;
int err = 0;
- page = f2fs_get_lock_data_page(inode, bidx, true);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_lock_data_folio(inode, bidx, true);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
err = -ENOENT;
@@ -1468,12 +1468,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
if (gc_type == BG_GC) {
- if (folio_test_writeback(page_folio(page))) {
+ if (folio_test_writeback(folio)) {
err = -EAGAIN;
goto out;
}
- set_page_dirty(page);
- set_page_private_gcing(page);
+ folio_mark_dirty(folio);
+ set_page_private_gcing(&folio->page);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
@@ -1483,37 +1483,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
.old_blkaddr = NULL_ADDR,
- .page = page,
+ .page = &folio->page,
.encrypted_page = NULL,
.need_lock = LOCK_REQ,
.io_type = FS_GC_DATA_IO,
};
- bool is_dirty = PageDirty(page);
+ bool is_dirty = folio_test_dirty(folio);
retry:
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- set_page_dirty(page);
- if (clear_page_dirty_for_io(page)) {
+ folio_mark_dirty(folio);
+ if (folio_clear_dirty_for_io(folio)) {
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
}
- set_page_private_gcing(page);
+ set_page_private_gcing(&folio->page);
err = f2fs_do_write_data_page(&fio);
if (err) {
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
if (err == -ENOMEM) {
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
if (is_dirty)
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
}
out:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
@@ -1542,7 +1542,6 @@ next_step:
entry = sum;
for (off = 0; off < usable_blks_in_seg; off++, entry++) {
- struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
@@ -1585,6 +1584,7 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 3) {
+ struct folio *data_folio;
int err;
inode = f2fs_iget(sb, dni.ino);
@@ -1635,15 +1635,15 @@ next_step:
continue;
}
- data_page = f2fs_get_read_data_page(inode, start_bidx,
+ data_folio = f2fs_get_read_data_folio(inode, start_bidx,
REQ_RAHEAD, true, NULL);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (IS_ERR(data_page)) {
+ if (IS_ERR(data_folio)) {
iput(inode);
continue;
}
- f2fs_put_page(data_page, 0);
+ f2fs_folio_put(data_folio, false);
add_gc_inode(gc_list, inode);
continue;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3e3c35d4c98b..ad92e9008781 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -119,7 +119,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio)
{
struct page *ipage;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage)) {
folio_unlock(folio);
return PTR_ERR(ipage);
@@ -237,7 +237,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_lock_op(sbi);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto out;
@@ -265,7 +265,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *ipage;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -312,7 +312,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
if (f2fs_has_inline_data(inode) &&
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -331,7 +331,7 @@ process_inline:
}
if (f2fs_has_inline_data(inode)) {
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ ipage = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
f2fs_truncate_inline_inode(inode, ipage, 0);
@@ -361,7 +361,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
struct page *ipage;
void *inline_dentry;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_inode_page(sbi, dir->i_ino);
if (IS_ERR(ipage)) {
*res_page = ipage;
return NULL;
@@ -609,7 +609,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_inode_page(sbi, dir->i_ino);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto out_fname;
@@ -644,7 +644,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
struct page *page = NULL;
int err = 0;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_inode_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -734,7 +734,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
void *inline_dentry;
struct f2fs_dentry_ptr d;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
+ ipage = f2fs_get_inode_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return false;
@@ -765,7 +765,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (ctx->pos == d.max)
return 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -797,7 +797,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
struct page *ipage;
int err = 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 3dd25f64d6f1..83f862578fc8 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -34,10 +34,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
if (f2fs_inode_dirtied(inode, sync))
return;
- if (f2fs_is_atomic_file(inode)) {
- set_inode_flag(inode, FI_ATOMIC_DIRTIED);
+ if (f2fs_is_atomic_file(inode))
return;
- }
mark_inode_dirty_sync(inode);
}
@@ -410,7 +408,7 @@ static int do_read_inode(struct inode *inode)
if (f2fs_check_nid_range(sbi, inode->i_ino))
return -EINVAL;
- node_page = f2fs_get_node_page(sbi, inode->i_ino);
+ node_page = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
@@ -757,7 +755,7 @@ void f2fs_update_inode_page(struct inode *inode)
struct page *node_page;
int count = 0;
retry:
- node_page = f2fs_get_node_page(sbi, inode->i_ino);
+ node_page = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
int err = PTR_ERR(node_page);
@@ -765,8 +763,12 @@ retry:
if (err == -ENOENT)
return;
+ if (err == -EFSCORRUPTED)
+ goto stop_checkpoint;
+
if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
+stop_checkpoint:
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE);
return;
}
@@ -789,6 +791,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
+ /*
+ * No need to update the inode page here; f2fs_evict_inode() will
+ * ultimately clear the inode's dirty status.
+ */
+ if (f2fs_cp_error(sbi))
+ return -EIO;
+
if (!f2fs_is_checkpoint_ready(sbi)) {
f2fs_mark_inode_dirty_sync(inode, true);
return -ENOSPC;
@@ -804,6 +813,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
}
+static void f2fs_remove_donate_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (list_empty(&F2FS_I(inode)->gdonate_list))
+ return;
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ list_del_init(&F2FS_I(inode)->gdonate_list);
+ sbi->donate_files--;
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+}
+
/*
* Called at the last iput() if i_nlink is zero
*/
@@ -838,6 +860,7 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_bug_on(sbi, get_dirty_pages(inode));
f2fs_remove_dirty_inode(inode);
+ f2fs_remove_donate_inode(inode);
if (!IS_DEVICE_ALIASING(inode))
f2fs_destroy_extent_tree(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a278c7da8177..8f8b9b843bdf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ if (inode->i_nlink == 0) {
+ f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink",
+ __func__, inode->i_ino);
+ err = -EFSCORRUPTED;
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ goto out_iput;
+ }
+
if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
@@ -684,23 +692,23 @@ out_free_encrypted_link:
return err;
}
-static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ return ERR_PTR(-EIO);
err = f2fs_dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -722,12 +730,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
f2fs_sync_fs(sbi->sb, 1);
f2fs_balance_fs(sbi, true);
- return 0;
+ return NULL;
out_fail:
clear_inode_flag(inode, FI_INC_LINK);
f2fs_handle_failed_inode(inode);
- return err;
+ return ERR_PTR(err);
}
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f88392fc4ba9..5f15c224bf78 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
-bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio)
{
- return NODE_MAPPING(sbi) == page->mapping &&
- IS_DNODE(page) && is_cold_node(page);
+ return NODE_MAPPING(sbi) == folio->mapping &&
+ IS_DNODE(&folio->page) && is_cold_node(&folio->page);
}
void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
@@ -778,7 +778,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
npage[0] = dn->inode_page;
if (!npage[0]) {
- npage[0] = f2fs_get_node_page(sbi, nids[0]);
+ npage[0] = f2fs_get_inode_page(sbi, nids[0]);
if (IS_ERR(npage[0]))
return PTR_ERR(npage[0]);
}
@@ -1130,26 +1130,33 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
unsigned int nofs = 0;
struct f2fs_inode *ri;
struct dnode_of_data dn;
- struct page *page;
+ struct folio *folio;
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
- if (level < 0) {
+ if (level <= 0) {
+ if (!level) {
+ level = -EFSCORRUPTED;
+ f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u",
+ __func__, inode->i_ino,
+ from, ADDRS_PER_INODE(inode));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
trace_f2fs_truncate_inode_blocks_exit(inode, level);
return level;
}
- page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page)) {
- trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
- return PTR_ERR(page);
+ folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(folio)) {
+ trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio));
+ return PTR_ERR(folio);
}
- set_new_dnode(&dn, inode, page, NULL, 0);
- unlock_page(page);
+ set_new_dnode(&dn, inode, &folio->page, NULL, 0);
+ folio_unlock(folio);
- ri = F2FS_INODE(page);
+ ri = F2FS_INODE(&folio->page);
switch (level) {
case 0:
case 1:
@@ -1178,7 +1185,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
skip_partial:
while (cont) {
- dn.nid = get_nid(page, offset[0], true);
+ dn.nid = get_nid(&folio->page, offset[0], true);
switch (offset[0]) {
case NODE_DIR1_BLOCK:
case NODE_DIR2_BLOCK:
@@ -1199,7 +1206,7 @@ skip_partial:
BUG();
}
if (err == -ENOENT) {
- set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
+ set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
f2fs_err_ratelimited(sbi,
"truncate node fail, ino:%lu, nid:%u, "
@@ -1210,18 +1217,18 @@ skip_partial:
}
if (err < 0)
goto fail;
- if (offset[1] == 0 && get_nid(page, offset[0], true)) {
- lock_page(page);
- BUG_ON(page->mapping != NODE_MAPPING(sbi));
- set_nid(page, offset[0], 0, true);
- unlock_page(page);
+ if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) {
+ folio_lock(folio);
+ BUG_ON(folio->mapping != NODE_MAPPING(sbi));
+ set_nid(&folio->page, offset[0], 0, true);
+ folio_unlock(folio);
}
offset[1] = 0;
offset[0]++;
nofs += err;
}
fail:
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
trace_f2fs_truncate_inode_blocks_exit(inode, err);
return err > 0 ? 0 : err;
}
@@ -1238,7 +1245,7 @@ int f2fs_truncate_xattr_node(struct inode *inode)
if (!nid)
return 0;
- npage = f2fs_get_node_page(sbi, nid);
+ npage = f2fs_get_xnode_page(sbi, nid);
if (IS_ERR(npage))
return PTR_ERR(npage);
@@ -1449,10 +1456,32 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_put_page(apage, err ? 1 : 0);
}
-static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
- struct page *parent, int start)
+static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
+ struct page *page, pgoff_t nid,
+ enum node_type ntype)
{
- struct page *page;
+ if (unlikely(nid != nid_of_node(page) ||
+ (ntype == NODE_TYPE_INODE && !IS_INODE(page)) ||
+ (ntype == NODE_TYPE_XATTR &&
+ !f2fs_has_xattr_block(ofs_of_node(page))) ||
+ time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
+ f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
+ "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ ntype, nid, nid_of_node(page), ino_of_node(page),
+ ofs_of_node(page), cpver_of_node(page),
+ next_blkaddr_of_node(page));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct page *parent, int start,
+ enum node_type ntype)
+{
+ struct folio *folio;
int err;
if (!nid)
@@ -1460,11 +1489,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
if (f2fs_check_nid_range(sbi, nid))
return ERR_PTR(-EINVAL);
repeat:
- page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false);
+ if (IS_ERR(folio))
+ return folio;
- err = read_node_page(page, 0);
+ err = read_node_page(&folio->page, 0);
if (err < 0) {
goto out_put_err;
} else if (err == LOCKED_PAGE) {
@@ -1475,54 +1504,72 @@ repeat:
if (parent)
f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE);
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
+ if (unlikely(folio->mapping != NODE_MAPPING(sbi))) {
+ f2fs_folio_put(folio, true);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
err = -EIO;
goto out_err;
}
- if (!f2fs_inode_chksum_verify(sbi, page)) {
+ if (!f2fs_inode_chksum_verify(sbi, &folio->page)) {
err = -EFSBADCRC;
goto out_err;
}
page_hit:
- if (likely(nid == nid_of_node(page)))
- return page;
-
- f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- nid, nid_of_node(page), ino_of_node(page),
- ofs_of_node(page), cpver_of_node(page),
- next_blkaddr_of_node(page));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
- err = -EFSCORRUPTED;
+ err = sanity_check_node_footer(sbi, &folio->page, nid, ntype);
+ if (!err)
+ return folio;
out_err:
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
out_put_err:
/* ENOENT comes from read_node_page which is not an error. */
if (err != -ENOENT)
- f2fs_handle_page_eio(sbi, page_folio(page), NODE);
- f2fs_put_page(page, 1);
+ f2fs_handle_page_eio(sbi, folio, NODE);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
{
- return __get_node_page(sbi, nid, NULL, 0);
+ struct folio *folio = __get_node_folio(sbi, nid, NULL, 0,
+ NODE_TYPE_REGULAR);
+
+ return &folio->page;
+}
+
+struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino)
+{
+ return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE);
+}
+
+struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino)
+{
+ struct folio *folio = f2fs_get_inode_folio(sbi, ino);
+
+ return &folio->page;
+}
+
+struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid)
+{
+ struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0,
+ NODE_TYPE_XATTR);
+
+ return &folio->page;
}
struct page *f2fs_get_node_page_ra(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
nid_t nid = get_nid(parent, start, false);
+ struct folio *folio = __get_node_folio(sbi, nid, parent, start,
+ NODE_TYPE_REGULAR);
- return __get_node_page(sbi, nid, parent, start);
+ return &folio->page;
}
static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
@@ -1561,11 +1608,11 @@ iput_out:
iput(inode);
}
-static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
+static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
{
pgoff_t index;
struct folio_batch fbatch;
- struct page *last_page = NULL;
+ struct folio *last_folio = NULL;
int nr_folios;
folio_batch_init(&fbatch);
@@ -1577,45 +1624,45 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_put_page(last_page, 0);
+ f2fs_folio_put(last_folio, false);
folio_batch_release(&fbatch);
return ERR_PTR(-EIO);
}
- if (!IS_DNODE(page) || !is_cold_node(page))
+ if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page))
continue;
- if (ino_of_node(page) != ino)
+ if (ino_of_node(&folio->page) != ino)
continue;
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(folio->mapping != NODE_MAPPING(sbi))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (ino_of_node(page) != ino)
+ if (ino_of_node(&folio->page) != ino)
goto continue_unlock;
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
- if (last_page)
- f2fs_put_page(last_page, 0);
+ if (last_folio)
+ f2fs_folio_put(last_folio, false);
- get_page(page);
- last_page = page;
- unlock_page(page);
+ folio_get(folio);
+ last_folio = folio;
+ folio_unlock(folio);
}
folio_batch_release(&fbatch);
cond_resched();
}
- return last_page;
+ return last_folio;
}
static int __write_node_page(struct page *page, bool atomic, bool *submitted,
@@ -1694,7 +1741,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
/* should add to global list before clearing PAGECACHE status */
- if (f2fs_in_warm_node_list(sbi, page)) {
+ if (f2fs_in_warm_node_list(sbi, folio)) {
seq = f2fs_add_fsync_node_entry(sbi, page);
if (seq_id)
*seq_id = seq;
@@ -1769,13 +1816,6 @@ release_page:
return err;
}
-static int f2fs_write_node_page(struct page *page,
- struct writeback_control *wbc)
-{
- return __write_node_page(page, false, NULL, wbc, false,
- FS_NODE_IO, NULL);
-}
-
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id)
@@ -1783,16 +1823,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
pgoff_t index;
struct folio_batch fbatch;
int ret = 0;
- struct page *last_page = NULL;
+ struct folio *last_folio = NULL;
bool marked = false;
nid_t ino = inode->i_ino;
int nr_folios;
int nwritten = 0;
if (atomic) {
- last_page = last_fsync_dnode(sbi, ino);
- if (IS_ERR_OR_NULL(last_page))
- return PTR_ERR_OR_ZERO(last_page);
+ last_folio = last_fsync_dnode(sbi, ino);
+ if (IS_ERR_OR_NULL(last_folio))
+ return PTR_ERR_OR_ZERO(last_folio);
}
retry:
folio_batch_init(&fbatch);
@@ -1804,73 +1844,73 @@ retry:
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
bool submitted = false;
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_put_page(last_page, 0);
+ f2fs_folio_put(last_folio, false);
folio_batch_release(&fbatch);
ret = -EIO;
goto out;
}
- if (!IS_DNODE(page) || !is_cold_node(page))
+ if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page))
continue;
- if (ino_of_node(page) != ino)
+ if (ino_of_node(&folio->page) != ino)
continue;
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(folio->mapping != NODE_MAPPING(sbi))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (ino_of_node(page) != ino)
+ if (ino_of_node(&folio->page) != ino)
goto continue_unlock;
- if (!PageDirty(page) && page != last_page) {
+ if (!folio_test_dirty(folio) && folio != last_folio) {
/* someone wrote it for us */
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, NODE, true, true);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
+ set_fsync_mark(&folio->page, 0);
+ set_dentry_mark(&folio->page, 0);
- if (!atomic || page == last_page) {
- set_fsync_mark(page, 1);
+ if (!atomic || folio == last_folio) {
+ set_fsync_mark(&folio->page, 1);
percpu_counter_inc(&sbi->rf_node_block_count);
- if (IS_INODE(page)) {
+ if (IS_INODE(&folio->page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
- f2fs_update_inode(inode, page);
- set_dentry_mark(page,
+ f2fs_update_inode(inode, &folio->page);
+ set_dentry_mark(&folio->page,
f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
- if (!PageDirty(page))
- set_page_dirty(page);
+ if (!folio_test_dirty(folio))
+ folio_mark_dirty(folio);
}
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- ret = __write_node_page(page, atomic &&
- page == last_page,
+ ret = __write_node_page(&folio->page, atomic &&
+ folio == last_folio,
&submitted, wbc, true,
FS_NODE_IO, seq_id);
if (ret) {
- unlock_page(page);
- f2fs_put_page(last_page, 0);
+ folio_unlock(folio);
+ f2fs_folio_put(last_folio, false);
break;
} else if (submitted) {
nwritten++;
}
- if (page == last_page) {
- f2fs_put_page(page, 0);
+ if (folio == last_folio) {
+ f2fs_folio_put(folio, false);
marked = true;
break;
}
@@ -1883,11 +1923,11 @@ continue_unlock:
}
if (!ret && atomic && !marked) {
f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
- ino, page_folio(last_page)->index);
- lock_page(last_page);
- f2fs_wait_on_page_writeback(last_page, NODE, true, true);
- set_page_dirty(last_page);
- unlock_page(last_page);
+ ino, last_folio->index);
+ folio_lock(last_folio);
+ f2fs_folio_wait_writeback(last_folio, NODE, true, true);
+ folio_mark_dirty(last_folio);
+ folio_unlock(last_folio);
goto retry;
}
out:
@@ -1920,18 +1960,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
return 1;
}
-static bool flush_dirty_inode(struct page *page)
+static bool flush_dirty_inode(struct folio *folio)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
struct inode *inode;
- nid_t ino = ino_of_node(page);
+ nid_t ino = ino_of_node(&folio->page);
inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
if (!inode)
return false;
- f2fs_update_inode(inode, page);
- unlock_page(page);
+ f2fs_update_inode(inode, &folio->page);
+ folio_unlock(folio);
iput(inode);
return true;
@@ -1951,32 +1991,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
- if (!IS_INODE(page))
+ if (!IS_INODE(&folio->page))
continue;
- lock_page(page);
-
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
-continue_unlock:
- unlock_page(page);
- continue;
- }
+ folio_lock(folio);
- if (!PageDirty(page)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
+ if (unlikely(folio->mapping != NODE_MAPPING(sbi)))
+ goto unlock;
+ if (!folio_test_dirty(folio))
+ goto unlock;
/* flush inline_data, if it's async context. */
- if (page_private_inline(page)) {
- clear_page_private_inline(page);
- unlock_page(page);
- flush_inline_data(sbi, ino_of_node(page));
+ if (page_private_inline(&folio->page)) {
+ clear_page_private_inline(&folio->page);
+ folio_unlock(folio);
+ flush_inline_data(sbi, ino_of_node(&folio->page));
continue;
}
- unlock_page(page);
+unlock:
+ folio_unlock(folio);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -2005,7 +2040,7 @@ next_step:
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
bool submitted = false;
/* give a priority to WB_SYNC threads */
@@ -2021,27 +2056,27 @@ next_step:
* 1. dentry dnodes
* 2. file dnodes
*/
- if (step == 0 && IS_DNODE(page))
+ if (step == 0 && IS_DNODE(&folio->page))
continue;
- if (step == 1 && (!IS_DNODE(page) ||
- is_cold_node(page)))
+ if (step == 1 && (!IS_DNODE(&folio->page) ||
+ is_cold_node(&folio->page)))
continue;
- if (step == 2 && (!IS_DNODE(page) ||
- !is_cold_node(page)))
+ if (step == 2 && (!IS_DNODE(&folio->page) ||
+ !is_cold_node(&folio->page)))
continue;
lock_node:
if (wbc->sync_mode == WB_SYNC_ALL)
- lock_page(page);
- else if (!trylock_page(page))
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
continue;
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(folio->mapping != NODE_MAPPING(sbi))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
@@ -2051,29 +2086,29 @@ continue_unlock:
goto write_node;
/* flush inline_data */
- if (page_private_inline(page)) {
- clear_page_private_inline(page);
- unlock_page(page);
- flush_inline_data(sbi, ino_of_node(page));
+ if (page_private_inline(&folio->page)) {
+ clear_page_private_inline(&folio->page);
+ folio_unlock(folio);
+ flush_inline_data(sbi, ino_of_node(&folio->page));
goto lock_node;
}
/* flush dirty inode */
- if (IS_INODE(page) && flush_dirty_inode(page))
+ if (IS_INODE(&folio->page) && flush_dirty_inode(folio))
goto lock_node;
write_node:
- f2fs_wait_on_page_writeback(page, NODE, true, true);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
+ set_fsync_mark(&folio->page, 0);
+ set_dentry_mark(&folio->page, 0);
- ret = __write_node_page(page, false, &submitted,
+ ret = __write_node_page(&folio->page, false, &submitted,
wbc, do_balance, io_type, NULL);
if (ret)
- unlock_page(page);
+ folio_unlock(folio);
else if (submitted)
nwritten++;
@@ -2207,7 +2242,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
* Structure of the f2fs node operations
*/
const struct address_space_operations f2fs_node_aops = {
- .writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
.dirty_folio = f2fs_dirty_node_folio,
.invalidate_folio = f2fs_invalidate_folio,
@@ -2269,24 +2303,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
}
}
-bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- unsigned int i;
- bool ret = true;
-
- f2fs_down_read(&nm_i->nat_tree_lock);
- for (i = 0; i < nm_i->nat_blocks; i++) {
- if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
- ret = false;
- break;
- }
- }
- f2fs_up_read(&nm_i->nat_tree_lock);
-
- return ret;
-}
-
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
bool set, bool build)
{
@@ -2717,7 +2733,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
struct page *ipage;
struct f2fs_inode *ri;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
@@ -2965,23 +2981,7 @@ add_out:
list_add_tail(&nes->set_list, head);
}
-static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
- unsigned int valid)
-{
- if (valid == 0) {
- __set_bit_le(nat_ofs, nm_i->empty_nat_bits);
- __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
- return;
- }
-
- __clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
- if (valid == NAT_ENTRY_PER_BLOCK)
- __set_bit_le(nat_ofs, nm_i->full_nat_bits);
- else
- __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
-}
-
-static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
struct page *page)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -2990,7 +2990,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
int valid = 0;
int i = 0;
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
+ if (!enabled_nat_bits(sbi, NULL))
return;
if (nat_index == 0) {
@@ -3001,36 +3001,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
valid++;
}
-
- __update_nat_bits(nm_i, nat_index, valid);
-}
-
-void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- unsigned int nat_ofs;
-
- f2fs_down_read(&nm_i->nat_tree_lock);
-
- for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
- unsigned int valid = 0, nid_ofs = 0;
-
- /* handle nid zero due to it should never be used */
- if (unlikely(nat_ofs == 0)) {
- valid = 1;
- nid_ofs = 1;
- }
-
- for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
- if (!test_bit_le(nid_ofs,
- nm_i->free_nid_bitmap[nat_ofs]))
- valid++;
- }
-
- __update_nat_bits(nm_i, nat_ofs, valid);
+ if (valid == 0) {
+ __set_bit_le(nat_index, nm_i->empty_nat_bits);
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+ return;
}
- f2fs_up_read(&nm_i->nat_tree_lock);
+ __clear_bit_le(nat_index, nm_i->empty_nat_bits);
+ if (valid == NAT_ENTRY_PER_BLOCK)
+ __set_bit_le(nat_index, nm_i->full_nat_bits);
+ else
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3049,7 +3030,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if ((cpc->reason & CP_UMOUNT) ||
+ if (enabled_nat_bits(sbi, cpc) ||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
@@ -3096,7 +3077,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
- update_nat_bits(sbi, start_nid, page);
+ __update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
}
@@ -3127,7 +3108,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
*/
- if (cpc->reason & CP_UMOUNT) {
+ if (enabled_nat_bits(sbi, cpc)) {
f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
f2fs_up_write(&nm_i->nat_tree_lock);
@@ -3143,7 +3124,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (cpc->reason & CP_UMOUNT ||
+ if (enabled_nat_bits(sbi, cpc) ||
!__has_cursum_space(journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
@@ -3180,18 +3161,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
__u64 cp_ver = cur_cp_version(ckpt);
block_t nat_bits_addr;
+ if (!enabled_nat_bits(sbi, NULL))
+ return 0;
+
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
- nm_i->full_nat_bits = nm_i->nat_bits + 8;
- nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
-
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
- return 0;
-
nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
@@ -3208,12 +3186,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
cp_ver |= (cur_cp_crc(ckpt) << 32);
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
- clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
- cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
+ disable_nat_bits(sbi, true);
return 0;
}
+ nm_i->full_nat_bits = nm_i->nat_bits + 8;
+ nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
f2fs_notice(sbi, "Found nat_bits in checkpoint");
return 0;
}
@@ -3224,7 +3203,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
unsigned int i = 0;
nid_t nid, last_nid;
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
+ if (!enabled_nat_bits(sbi, NULL))
return;
for (i = 0; i < nm_i->nat_blocks; i++) {
@@ -3296,6 +3275,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
if (!nm_i->nat_bitmap)
return -ENOMEM;
+ if (!test_opt(sbi, NAT_BITS))
+ disable_nat_bits(sbi, true);
+
err = __get_nat_bitmaps(sbi);
if (err)
return err;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 6aea13024ac1..103a437e6425 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -52,6 +52,13 @@ enum {
IS_PREALLOC, /* nat entry is preallocated */
};
+/* For node type in __get_node_folio() */
+enum node_type {
+ NODE_TYPE_REGULAR,
+ NODE_TYPE_INODE,
+ NODE_TYPE_XATTR,
+};
+
/*
* For node information
*/
@@ -248,7 +255,7 @@ static inline nid_t nid_of_node(struct page *node_page)
return le32_to_cpu(rn->footer.nid);
}
-static inline unsigned int ofs_of_node(struct page *node_page)
+static inline unsigned int ofs_of_node(const struct page *node_page)
{
struct f2fs_node *rn = F2FS_NODE(node_page);
unsigned flag = le32_to_cpu(rn->footer.flag);
@@ -342,7 +349,7 @@ static inline bool is_recoverable_dnode(struct page *page)
* `- indirect node ((6 + 2N) + (N - 1)(N + 1))
* `- direct node
*/
-static inline bool IS_DNODE(struct page *node_page)
+static inline bool IS_DNODE(const struct page *node_page)
{
unsigned int ofs = ofs_of_node(node_page);
@@ -389,7 +396,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold data pages in page cache
*/
-static inline int is_node(struct page *page, int type)
+static inline int is_node(const struct page *page, int type)
{
struct f2fs_node *rn = F2FS_NODE(page);
return le32_to_cpu(rn->footer.flag) & BIT(type);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c282e8a0a2ec..396ef71f41e3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -2096,7 +2096,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
return false;
if (!force) {
- if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
+ if (!f2fs_realtime_discard_enable(sbi) ||
+ (!se->valid_blocks &&
+ !IS_CURSEG(sbi, cpc->trim_start)) ||
SM_I(sbi)->dcc_info->nr_discards >=
SM_I(sbi)->dcc_info->max_discards)
return false;
@@ -2320,10 +2322,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
- if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
+ if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT ||
+ F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
dcc->discard_granularity = BLKS_PER_SEG(sbi);
- else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
- dcc->discard_granularity = BLKS_PER_SEC(sbi);
INIT_LIST_HEAD(&dcc->entry_list);
for (i = 0; i < MAX_PLIST_NUM; i++)
@@ -2806,7 +2807,7 @@ find_other_zone:
MAIN_SECS(sbi));
if (secno >= MAIN_SECS(sbi)) {
ret = -ENOSPC;
- f2fs_bug_on(sbi, 1);
+ f2fs_bug_on(sbi, !pinning);
goto out_unlock;
}
}
@@ -2848,7 +2849,7 @@ got_it:
out_unlock:
spin_unlock(&free_i->segmap_lock);
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC && !pinning)
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
return ret;
}
@@ -2921,6 +2922,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
return curseg->segno;
}
+static void reset_curseg_fields(struct curseg_info *curseg)
+{
+ curseg->inited = false;
+ curseg->segno = NULL_SEGNO;
+ curseg->next_segno = 0;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -2939,7 +2947,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
ret = get_new_segment(sbi, &segno, new_sec, pinning);
if (ret) {
if (ret == -ENOSPC)
- curseg->segno = NULL_SEGNO;
+ reset_curseg_fields(curseg);
return ret;
}
@@ -3710,13 +3718,6 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
-static void reset_curseg_fields(struct curseg_info *curseg)
-{
- curseg->inited = false;
- curseg->segno = NULL_SEGNO;
- curseg->next_segno = 0;
-}
-
int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
@@ -3902,6 +3903,7 @@ static int log_type_to_seg_type(enum log_type type)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
+ struct folio *folio = page_folio(fio->page);
enum log_type type = __get_segment_type(fio);
int seg_type = log_type_to_seg_type(type);
bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
@@ -3912,10 +3914,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio)) {
- if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
+ if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- end_page_writeback(fio->page);
- if (f2fs_in_warm_node_list(fio->sbi, fio->page))
+ folio_end_writeback(folio);
+ if (f2fs_in_warm_node_list(fio->sbi, folio))
f2fs_del_fsync_node_entry(fio->sbi, fio->page);
goto out;
}
@@ -4154,22 +4156,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
f2fs_update_data_blkaddr(dn, new_addr);
}
-void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered, bool locked)
+void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
+ bool ordered, bool locked)
{
- if (folio_test_writeback(page_folio(page))) {
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ if (folio_test_writeback(folio)) {
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
/* submit cached LFS IO */
- f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+ f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type);
/* submit cached IPU IO */
- f2fs_submit_merged_ipu_write(sbi, NULL, page);
+ f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page);
if (ordered) {
- wait_on_page_writeback(page);
- f2fs_bug_on(sbi, locked &&
- folio_test_writeback(page_folio(page)));
+ folio_wait_writeback(folio);
+ f2fs_bug_on(sbi, locked && folio_test_writeback(folio));
} else {
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
}
}
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 943be4f1d6d2..0465dc00b349 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -559,13 +559,16 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int data_blocks,
unsigned int dent_blocks)
{
-
unsigned int segno, left_blocks, blocks;
int i;
/* check current data/node sections in the worst case. */
for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) {
segno = CURSEG_I(sbi, i)->segno;
+
+ if (unlikely(segno == NULL_SEGNO))
+ return false;
+
left_blocks = CAP_BLKS_PER_SEC(sbi) -
get_ckpt_valid_blocks(sbi, segno, true);
@@ -576,6 +579,10 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
/* check current data section for dentry blocks. */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
+
+ if (unlikely(segno == NULL_SEGNO))
+ return false;
+
left_blocks = CAP_BLKS_PER_SEC(sbi) -
get_ckpt_valid_blocks(sbi, segno, true);
if (dent_blocks > left_blocks)
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 83d6fb97dcae..9c8d3aee89af 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
mutex_unlock(&sbi->umount_mutex);
}
spin_unlock(&f2fs_list_lock);
- return count;
+ return count ?: SHRINK_EMPTY;
}
unsigned long f2fs_shrink_scan(struct shrinker *shrink,
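The hunk above makes f2fs_shrink_count() report SHRINK_EMPTY instead of 0 when there is nothing cached, which lets the shrinker core skip calling the scan callback for this instance. As a quick illustration of the GNU "a ?: b" shorthand used there (equivalent to "a ? a : b"), here is a minimal userspace sketch; the SHRINK_EMPTY value is an assumption copied for the sketch, verify it against include/linux/shrinker.h in your tree:

#include <stdio.h>

/* assumption for this sketch only; check include/linux/shrinker.h */
#define SHRINK_EMPTY (~0UL - 1)

int main(void)
{
        unsigned long count = 0;

        /* GNU C: "count ?: SHRINK_EMPTY" means "count ? count : SHRINK_EMPTY",
         * so an empty cache is reported as SHRINK_EMPTY rather than 0. */
        unsigned long reported = count ?: SHRINK_EMPTY;

        printf("reported = %lu\n", reported);
        return 0;
}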
@@ -130,6 +130,96 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
return freed;
}
+unsigned int f2fs_donate_files(void)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned int donate_files = 0;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ donate_files += sbi->donate_files;
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ return donate_files;
+}
+
+static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi,
+ unsigned int reclaim_caches_kb)
+{
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ unsigned int nfiles = sbi->donate_files;
+ pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10);
+
+ while (npages && nfiles--) {
+ pgoff_t len;
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ if (list_empty(&sbi->inode_list[DONATE_INODE])) {
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ break;
+ }
+ fi = list_first_entry(&sbi->inode_list[DONATE_INODE],
+ struct f2fs_inode_info, gdonate_list);
+ list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+
+ if (!inode)
+ continue;
+
+ len = fi->donate_end - fi->donate_start + 1;
+ npages = npages < len ? 0 : npages - len;
+ invalidate_inode_pages2_range(inode->i_mapping,
+ fi->donate_start, fi->donate_end);
+ iput(inode);
+ cond_resched();
+ }
+ return npages << (PAGE_SHIFT - 10);
+}
+
+void f2fs_reclaim_caches(unsigned int reclaim_caches_kb)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list && reclaim_caches_kb) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+}
+
void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
{
spin_lock(&f2fs_list_lock);
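do_reclaim_caches() above converts the requested amount between kibibytes and page counts with shifts by (PAGE_SHIFT - 10), and returns whatever could not be invalidated so f2fs_reclaim_caches() can carry the remainder to the next superblock. A small standalone sketch of that arithmetic, assuming the common 4 KiB page size (PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages, as on most architectures */

int main(void)
{
        unsigned int reclaim_kb = 1024;                          /* ask to drop 1 MiB of cache */
        unsigned long npages = reclaim_kb >> (PAGE_SHIFT - 10);  /* KiB -> pages: 1024 >> 2 = 256 */

        printf("%u KiB = %lu pages\n", reclaim_kb, npages);
        printf("%lu pages = %lu KiB\n", npages, npages << (PAGE_SHIFT - 10));
        return 0;
}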
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 19b67828ae32..f087b2b71c89 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -63,6 +63,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_BLKADDR_VALIDITY] = "invalid blkaddr",
[FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr",
[FAULT_NO_SEGMENT] = "no free segment",
+ [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer",
};
int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
@@ -190,6 +191,7 @@ enum {
Opt_memory_mode,
Opt_age_extent_cache,
Opt_errors,
+ Opt_nat_bits,
Opt_err,
};
@@ -269,6 +271,7 @@ static match_table_t f2fs_tokens = {
{Opt_memory_mode, "memory=%s"},
{Opt_age_extent_cache, "age_extent_cache"},
{Opt_errors, "errors=%s"},
+ {Opt_nat_bits, "nat_bits"},
{Opt_err, NULL},
};
@@ -383,10 +386,10 @@ static void init_once(void *foo)
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
-static int f2fs_set_qf_name(struct super_block *sb, int qtype,
+static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype,
substring_t *args)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct super_block *sb = sbi->sb;
char *qname;
int ret = -EINVAL;
@@ -424,9 +427,9 @@ errout:
return ret;
}
-static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
+static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct super_block *sb = sbi->sb;
if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
@@ -483,12 +486,11 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
}
#endif
-static int f2fs_set_test_dummy_encryption(struct super_block *sb,
+static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi,
const char *opt,
const substring_t *arg,
bool is_remount)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct fs_parameter param = {
.type = fs_value_is_string,
.string = arg->from ? arg->from : "",
@@ -671,9 +673,8 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
#endif
#endif
-static int parse_options(struct super_block *sb, char *options, bool is_remount)
+static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
substring_t args[MAX_OPT_ARGS];
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned char (*ext)[F2FS_EXTENSION_LEN];
@@ -687,7 +688,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
int ret;
if (!options)
- goto default_check;
+ return 0;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -728,10 +729,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
set_opt(sbi, DISABLE_ROLL_FORWARD);
break;
case Opt_norecovery:
- /* this option mounts f2fs with ro */
+ /* requires ro mount, checked in f2fs_default_check */
set_opt(sbi, NORECOVERY);
- if (!f2fs_readonly(sb))
- return -EINVAL;
break;
case Opt_discard:
if (!f2fs_hw_support_discard(sbi)) {
@@ -772,16 +771,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
#else
case Opt_user_xattr:
- f2fs_info(sbi, "user_xattr options not supported");
- break;
case Opt_nouser_xattr:
- f2fs_info(sbi, "nouser_xattr options not supported");
- break;
case Opt_inline_xattr:
- f2fs_info(sbi, "inline_xattr options not supported");
- break;
case Opt_noinline_xattr:
- f2fs_info(sbi, "noinline_xattr options not supported");
+ case Opt_inline_xattr_size:
+ f2fs_info(sbi, "xattr options not supported");
break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -793,10 +787,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
#else
case Opt_acl:
- f2fs_info(sbi, "acl options not supported");
- break;
case Opt_noacl:
- f2fs_info(sbi, "noacl options not supported");
+ f2fs_info(sbi, "acl options not supported");
break;
#endif
case Opt_active_logs:
@@ -838,7 +830,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
set_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noextent_cache:
- if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) {
+ if (f2fs_sb_has_device_alias(sbi)) {
f2fs_err(sbi, "device aliasing requires extent cache");
return -EINVAL;
}
@@ -919,18 +911,15 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
break;
#else
case Opt_fault_injection:
- f2fs_info(sbi, "fault_injection options not supported");
- break;
-
case Opt_fault_type:
- f2fs_info(sbi, "fault_type options not supported");
+ f2fs_info(sbi, "fault injection options not supported");
break;
#endif
case Opt_lazytime:
- sb->s_flags |= SB_LAZYTIME;
+ set_opt(sbi, LAZYTIME);
break;
case Opt_nolazytime:
- sb->s_flags &= ~SB_LAZYTIME;
+ clear_opt(sbi, LAZYTIME);
break;
#ifdef CONFIG_QUOTA
case Opt_quota:
@@ -944,32 +933,32 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
set_opt(sbi, PRJQUOTA);
break;
case Opt_usrjquota:
- ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]);
+ ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]);
if (ret)
return ret;
break;
case Opt_grpjquota:
- ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]);
+ ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]);
if (ret)
return ret;
break;
case Opt_prjjquota:
- ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]);
+ ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]);
if (ret)
return ret;
break;
case Opt_offusrjquota:
- ret = f2fs_clear_qf_name(sb, USRQUOTA);
+ ret = f2fs_clear_qf_name(sbi, USRQUOTA);
if (ret)
return ret;
break;
case Opt_offgrpjquota:
- ret = f2fs_clear_qf_name(sb, GRPQUOTA);
+ ret = f2fs_clear_qf_name(sbi, GRPQUOTA);
if (ret)
return ret;
break;
case Opt_offprjjquota:
- ret = f2fs_clear_qf_name(sb, PRJQUOTA);
+ ret = f2fs_clear_qf_name(sbi, PRJQUOTA);
if (ret)
return ret;
break;
@@ -1039,14 +1028,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
kfree(name);
break;
case Opt_test_dummy_encryption:
- ret = f2fs_set_test_dummy_encryption(sb, p, &args[0],
+ ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0],
is_remount);
if (ret)
return ret;
break;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- sb->s_flags |= SB_INLINECRYPT;
+ set_opt(sbi, INLINECRYPT);
#else
f2fs_info(sbi, "inline encryption not supported");
#endif
@@ -1322,13 +1311,20 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_nat_bits:
+ set_opt(sbi, NAT_BITS);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
return -EINVAL;
}
}
-default_check:
+ return 0;
+}
+
+static int f2fs_default_check(struct f2fs_sb_info *sbi)
+{
#ifdef CONFIG_QUOTA
if (f2fs_check_quota_options(sbi))
return -EINVAL;
@@ -1418,6 +1414,12 @@ default_check:
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
}
+
+ if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) {
+ f2fs_err(sbi, "norecovery requires readonly mount");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -1441,6 +1443,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
+ INIT_LIST_HEAD(&fi->gdonate_list);
init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
init_f2fs_rwsem(&fi->i_xattr_sem);
@@ -1527,6 +1530,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync)
inc_page_count(sbi, F2FS_DIRTY_IMETA);
}
spin_unlock(&sbi->inode_lock[DIRTY_META]);
+
+ if (!ret && f2fs_is_atomic_file(inode))
+ set_inode_flag(inode, FI_ATOMIC_DIRTIED);
+
return ret;
}
@@ -1737,22 +1744,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
static int f2fs_freeze(struct super_block *sb)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
if (f2fs_readonly(sb))
return 0;
/* IO error happened before */
- if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
/* must be clean, since sync_filesystem() was already called */
- if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY))
return -EINVAL;
+ sbi->umount_lock_holder = current;
+
/* Let's flush checkpoints and stop the thread. */
- f2fs_flush_ckpt_thread(F2FS_SB(sb));
+ f2fs_flush_ckpt_thread(sbi);
+
+ sbi->umount_lock_holder = NULL;
/* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
- set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
+ set_sbi_flag(sbi, SBI_IS_FREEZING);
return 0;
}
@@ -1836,7 +1849,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = total_count - start_count;
spin_lock(&sbi->stat_lock);
-
+ if (sbi->carve_out)
+ buf->f_blocks -= sbi->current_reserved_blocks;
user_block_count = sbi->user_block_count;
total_valid_node_count = valid_node_count(sbi);
avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
@@ -2128,6 +2142,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC)
seq_printf(seq, ",errors=%s", "panic");
+ if (test_opt(sbi, NAT_BITS))
+ seq_puts(seq, ",nat_bits");
+
return 0;
}
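With the new nat_bits mount option parsed above and echoed back by f2fs_show_options() (and, as the remount hunk below shows, not toggleable across remounts), enabling it from userspace is an ordinary mount call. A hedged sketch; the device node and mount point are placeholders and root privileges plus an f2fs-formatted device are assumed:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* /dev/sdb1 and /mnt/f2fs are placeholders for this sketch */
        if (mount("/dev/sdb1", "/mnt/f2fs", "f2fs", 0, "nat_bits") != 0) {
                perror("mount");
                return 1;
        }
        puts("mounted f2fs with nat_bits");
        return 0;
}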
@@ -2175,8 +2192,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, MERGE_CHECKPOINT);
+ set_opt(sbi, LAZYTIME);
F2FS_OPTION(sbi).unusable_cap = 0;
- sbi->sb->s_flags |= SB_LAZYTIME;
if (!f2fs_is_readonly(sbi))
set_opt(sbi, FLUSH_MERGE);
if (f2fs_sb_has_blkzoned(sbi))
@@ -2318,6 +2335,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool block_unit_discard = f2fs_block_unit_discard(sbi);
+ bool no_nat_bits = !test_opt(sbi, NAT_BITS);
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2329,6 +2347,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
old_sb_flags = sb->s_flags;
+ sbi->umount_lock_holder = current;
+
#ifdef CONFIG_QUOTA
org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
@@ -2359,7 +2379,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
default_options(sbi, true);
/* parse mount options */
- err = parse_options(sb, data, true);
+ err = parse_options(sbi, data, true);
if (err)
goto restore_opts;
@@ -2374,6 +2394,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
#endif
+ err = f2fs_default_check(sbi);
+ if (err)
+ goto restore_opts;
+
/* flush outstanding errors before changing fs state */
flush_work(&sbi->s_error_work);
@@ -2444,6 +2468,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch nat_bits option is not allowed");
+ goto restore_opts;
+ }
+
if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
@@ -2552,6 +2582,8 @@ skip:
limit_reserve_root(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
+
+ sbi->umount_lock_holder = NULL;
return 0;
restore_checkpoint:
if (need_enable_checkpoint) {
@@ -2592,6 +2624,8 @@ restore_opts:
#endif
sbi->mount_opt = org_mount_opt;
sb->s_flags = old_sb_flags;
+
+ sbi->umount_lock_holder = NULL;
return err;
}
@@ -2908,7 +2942,7 @@ out:
return ret;
}
-int f2fs_quota_sync(struct super_block *sb, int type)
+int f2fs_do_quota_sync(struct super_block *sb, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
@@ -2956,11 +2990,21 @@ int f2fs_quota_sync(struct super_block *sb, int type)
return ret;
}
+static int f2fs_quota_sync(struct super_block *sb, int type)
+{
+ int ret;
+
+ F2FS_SB(sb)->umount_lock_holder = current;
+ ret = f2fs_do_quota_sync(sb, type);
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+ return ret;
+}
+
static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path)
{
struct inode *inode;
- int err;
+ int err = 0;
/* if quota sysfile exists, deny enabling quota with specific file */
if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) {
@@ -2971,31 +3015,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
if (path->dentry->d_sb != sb)
return -EXDEV;
- err = f2fs_quota_sync(sb, type);
+ F2FS_SB(sb)->umount_lock_holder = current;
+
+ err = f2fs_do_quota_sync(sb, type);
if (err)
- return err;
+ goto out;
inode = d_inode(path->dentry);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
- return err;
+ goto out;
err = filemap_fdatawait(inode->i_mapping);
if (err)
- return err;
+ goto out;
err = dquot_quota_on(sb, type, format_id, path);
if (err)
- return err;
+ goto out;
inode_lock(inode);
F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL;
f2fs_set_inode_flags(inode);
inode_unlock(inode);
f2fs_mark_inode_dirty_sync(inode, false);
-
- return 0;
+out:
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+ return err;
}
static int __f2fs_quota_off(struct super_block *sb, int type)
@@ -3006,7 +3053,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type)
if (!inode || !igrab(inode))
return dquot_quota_off(sb, type);
- err = f2fs_quota_sync(sb, type);
+ err = f2fs_do_quota_sync(sb, type);
if (err)
goto out_put;
@@ -3029,6 +3076,8 @@ static int f2fs_quota_off(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
int err;
+ F2FS_SB(sb)->umount_lock_holder = current;
+
err = __f2fs_quota_off(sb, type);
/*
@@ -3038,6 +3087,9 @@ static int f2fs_quota_off(struct super_block *sb, int type)
*/
if (is_journalled_quota(sbi))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+
return err;
}
@@ -3170,7 +3222,7 @@ int f2fs_dquot_initialize(struct inode *inode)
return 0;
}
-int f2fs_quota_sync(struct super_block *sb, int type)
+int f2fs_do_quota_sync(struct super_block *sb, int type)
{
return 0;
}
@@ -4220,6 +4272,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason)
if (shutdown)
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ else
+ dump_stack();
/*
* Continue filesystem operations if errors=continue. Should not set
@@ -4495,7 +4549,11 @@ try_onemore:
goto free_sb_buf;
}
- err = parse_options(sb, options, false);
+ err = parse_options(sbi, options, false);
+ if (err)
+ goto free_options;
+
+ err = f2fs_default_check(sbi);
if (err)
goto free_options;
@@ -4533,6 +4591,14 @@ try_onemore:
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
+ if (test_opt(sbi, INLINECRYPT))
+ sb->s_flags |= SB_INLINECRYPT;
+
+ if (test_opt(sbi, LAZYTIME))
+ sb->s_flags |= SB_LAZYTIME;
+ else
+ sb->s_flags &= ~SB_LAZYTIME;
+
super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid));
super_set_sysfs_name_bdev(sb);
sb->s_iflags |= SB_I_CGROUPWB;
@@ -4703,6 +4769,7 @@ try_onemore:
if (err)
goto free_compress_inode;
+ sbi->umount_lock_holder = current;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount */
if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
@@ -4718,8 +4785,10 @@ try_onemore:
if (err)
goto free_meta;
- if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
+ if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) {
+ skip_recovery = true;
goto reset_checkpoint;
+ }
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
@@ -4769,10 +4838,10 @@ try_onemore:
}
}
+reset_checkpoint:
#ifdef CONFIG_QUOTA
f2fs_recover_quota_end(sbi, quota_enabled);
#endif
-reset_checkpoint:
/*
* If the f2fs is not readonly and fsync data recovery succeeds,
* write pointer consistency of cursegs and other zones are already
@@ -4829,6 +4898,8 @@ reset_checkpoint:
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+
+ sbi->umount_lock_holder = NULL;
return 0;
sync_free_meta:
@@ -4931,6 +5002,8 @@ static void kill_f2fs_super(struct super_block *sb)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (sb->s_root) {
+ sbi->umount_lock_holder = current;
+
set_sbi_flag(sbi, SBI_IS_CLOSE);
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index d15c68b28952..c69161366467 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -61,6 +61,12 @@ struct f2fs_attr {
int id;
};
+struct f2fs_base_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct f2fs_base_attr *a, char *buf);
+ ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len);
+};
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf);
@@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
+static ssize_t f2fs_base_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_base_attr *a = container_of(attr,
+ struct f2fs_base_attr, attr);
+
+ return a->show ? a->show(a, buf) : 0;
+}
+
+static ssize_t f2fs_base_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_base_attr *a = container_of(attr,
+ struct f2fs_base_attr, attr);
+
+ return a->store ? a->store(a, buf, len) : 0;
+}
+
/*
* Note that there are three feature list entries:
* 1) /sys/fs/f2fs/features
@@ -880,18 +905,50 @@ static void f2fs_sb_release(struct kobject *kobj)
* please add new on-disk feature in this list only.
* - ref. F2FS_SB_FEATURE_RO_ATTR()
*/
-static ssize_t f2fs_feature_show(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi, char *buf)
+static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf)
{
return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
-static struct f2fs_attr f2fs_attr_##_name = { \
+static struct f2fs_base_attr f2fs_base_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_feature_show, \
}
+static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf)
+{
+ unsigned int res = 0;
+
+ if (!strcmp(a->attr.name, "reclaim_caches_kb"))
+ res = f2fs_donate_files();
+
+ return sysfs_emit(buf, "%u\n", res);
+}
+
+static ssize_t f2fs_tune_store(struct f2fs_base_attr *a,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (!strcmp(a->attr.name, "reclaim_caches_kb"))
+ f2fs_reclaim_caches(t);
+
+ return count;
+}
+
+#define F2FS_TUNE_RW_ATTR(_name) \
+static struct f2fs_base_attr f2fs_base_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0644 }, \
+ .show = f2fs_tune_show, \
+ .store = f2fs_tune_store, \
+}
+
static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -1065,6 +1122,7 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count);
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
+F2FS_SBI_GENERAL_RW_ATTR(carve_out);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1252,41 +1310,43 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(warm_data_age_threshold),
ATTR_LIST(last_age_weight),
ATTR_LIST(max_read_extent_count),
+ ATTR_LIST(carve_out),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
+#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr)
static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
- ATTR_LIST(encryption),
- ATTR_LIST(test_dummy_encryption_v2),
+ BASE_ATTR_LIST(encryption),
+ BASE_ATTR_LIST(test_dummy_encryption_v2),
#if IS_ENABLED(CONFIG_UNICODE)
- ATTR_LIST(encrypted_casefold),
+ BASE_ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
- ATTR_LIST(block_zoned),
+ BASE_ATTR_LIST(block_zoned),
#endif
- ATTR_LIST(atomic_write),
- ATTR_LIST(extra_attr),
- ATTR_LIST(project_quota),
- ATTR_LIST(inode_checksum),
- ATTR_LIST(flexible_inline_xattr),
- ATTR_LIST(quota_ino),
- ATTR_LIST(inode_crtime),
- ATTR_LIST(lost_found),
+ BASE_ATTR_LIST(atomic_write),
+ BASE_ATTR_LIST(extra_attr),
+ BASE_ATTR_LIST(project_quota),
+ BASE_ATTR_LIST(inode_checksum),
+ BASE_ATTR_LIST(flexible_inline_xattr),
+ BASE_ATTR_LIST(quota_ino),
+ BASE_ATTR_LIST(inode_crtime),
+ BASE_ATTR_LIST(lost_found),
#ifdef CONFIG_FS_VERITY
- ATTR_LIST(verity),
+ BASE_ATTR_LIST(verity),
#endif
- ATTR_LIST(sb_checksum),
+ BASE_ATTR_LIST(sb_checksum),
#if IS_ENABLED(CONFIG_UNICODE)
- ATTR_LIST(casefold),
+ BASE_ATTR_LIST(casefold),
#endif
- ATTR_LIST(readonly),
+ BASE_ATTR_LIST(readonly),
#ifdef CONFIG_F2FS_FS_COMPRESSION
- ATTR_LIST(compression),
+ BASE_ATTR_LIST(compression),
#endif
- ATTR_LIST(pin_file),
+ BASE_ATTR_LIST(pin_file),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
@@ -1343,6 +1403,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
};
ATTRIBUTE_GROUPS(f2fs_sb_feat);
+F2FS_TUNE_RW_ATTR(reclaim_caches_kb);
+
+static struct attribute *f2fs_tune_attrs[] = {
+ BASE_ATTR_LIST(reclaim_caches_kb),
+ NULL,
+};
+ATTRIBUTE_GROUPS(f2fs_tune);
+
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
@@ -1362,15 +1430,34 @@ static struct kset f2fs_kset = {
.kobj = {.ktype = &f2fs_ktype},
};
+static const struct sysfs_ops f2fs_feat_attr_ops = {
+ .show = f2fs_base_attr_show,
+ .store = f2fs_base_attr_store,
+};
+
static const struct kobj_type f2fs_feat_ktype = {
.default_groups = f2fs_feat_groups,
- .sysfs_ops = &f2fs_attr_ops,
+ .sysfs_ops = &f2fs_feat_attr_ops,
};
static struct kobject f2fs_feat = {
.kset = &f2fs_kset,
};
+static const struct sysfs_ops f2fs_tune_attr_ops = {
+ .show = f2fs_base_attr_show,
+ .store = f2fs_base_attr_store,
+};
+
+static const struct kobj_type f2fs_tune_ktype = {
+ .default_groups = f2fs_tune_groups,
+ .sysfs_ops = &f2fs_tune_attr_ops,
+};
+
+static struct kobject f2fs_tune = {
+ .kset = &f2fs_kset,
+};
+
static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
@@ -1607,6 +1694,11 @@ int __init f2fs_init_sysfs(void)
if (ret)
goto put_kobject;
+ ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype,
+ NULL, "tuning");
+ if (ret)
+ goto put_kobject;
+
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
if (!f2fs_proc_root) {
ret = -ENOMEM;
@@ -1614,7 +1706,9 @@ int __init f2fs_init_sysfs(void)
}
return 0;
+
put_kobject:
+ kobject_put(&f2fs_tune);
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
return ret;
@@ -1622,6 +1716,7 @@ put_kobject:
void f2fs_exit_sysfs(void)
{
+ kobject_put(&f2fs_tune);
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
remove_proc_entry("fs/f2fs", NULL);
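The new /sys/fs/f2fs/tuning kobject registered above currently exposes one attribute, reclaim_caches_kb: per f2fs_tune_show()/f2fs_tune_store(), reading it returns the number of donated files (via f2fs_donate_files()) and writing a size in KiB triggers f2fs_reclaim_caches(). A minimal userspace sketch of that round trip, assuming a kernel carrying this patch and sufficient privileges:

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/fs/f2fs/tuning/reclaim_caches_kb";
        unsigned int donated;
        FILE *f;

        f = fopen(path, "r");                 /* read: number of donated files */
        if (!f) { perror("open"); return 1; }
        if (fscanf(f, "%u", &donated) == 1)
                printf("donated files: %u\n", donated);
        fclose(f);

        f = fopen(path, "w");                 /* write: page cache to drop, in KiB */
        if (!f) { perror("open"); return 1; }
        fprintf(f, "1024\n");                 /* ask to reclaim up to 1 MiB */
        fclose(f);
        return 0;
}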
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3f3874943679..c691b35618ad 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -282,7 +282,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage,
if (ipage) {
inline_addr = inline_xattr_addr(inode, ipage);
} else {
- page = f2fs_get_node_page(sbi, inode->i_ino);
+ page = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -303,7 +303,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
void *xattr_addr;
/* The inode already has an extended attribute block. */
- xpage = f2fs_get_node_page(sbi, xnid);
+ xpage = f2fs_get_xnode_page(sbi, xnid);
if (IS_ERR(xpage))
return PTR_ERR(xpage);
@@ -449,7 +449,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(inode, ipage);
} else {
- in_page = f2fs_get_node_page(sbi, inode->i_ino);
+ in_page = f2fs_get_inode_page(sbi, inode->i_ino);
if (IS_ERR(in_page)) {
f2fs_alloc_nid_failed(sbi, new_nid);
return PTR_ERR(in_page);
@@ -475,7 +475,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
/* write to xattr node block */
if (F2FS_I(inode)->i_xattr_nid) {
- xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid);
if (IS_ERR(xpage)) {
err = PTR_ERR(xpage);
f2fs_alloc_nid_failed(sbi, new_nid);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index f06f6ba643cc..23e9b9371ec3 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -339,8 +339,8 @@ out:
}
/***** Make a directory */
-static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
@@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mutex_unlock(&MSDOS_SB(sb)->s_lock);
fat_flush_inodes(sb, dir, inode);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
/***** Unlink a file */
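msdos_mkdir() above (and vfat_mkdir, fuse_mkdir and gfs2_mkdir further down) is converted to the newer ->mkdir signature that returns a struct dentry * instead of an int: NULL on success, ERR_PTR(err) on failure, or an alternate dentry when d_splice_alias() produced one. A self-contained userspace model of just the return convention, with the ERR_PTR helpers re-implemented as stand-ins rather than taken from kernel headers:

#include <stdio.h>
#include <errno.h>

struct dentry { const char *name; };

/* stand-ins for the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() */
static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p) { return (unsigned long)p >= (unsigned long)-4095; }

static struct dentry *toy_mkdir(struct dentry *dentry, int fail)
{
        if (fail)
                return ERR_PTR(-ENOSPC);  /* failure: errno encoded in the pointer */
        return NULL;                      /* success: keep the dentry we were given */
}

int main(void)
{
        struct dentry d = { "newdir" };
        struct dentry *res = toy_mkdir(&d, 0);

        if (IS_ERR(res))
                printf("mkdir failed: %ld\n", PTR_ERR(res));
        else
                printf("mkdir ok, using dentry \"%s\"\n", res ? res->name : d.name);
        return 0;
}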
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 926c26e90ef8..dd910edd2404 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -841,8 +841,8 @@ out:
return err;
}
-static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..dc3f7e120e3e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,28 @@
#include "internal.h"
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+
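The out-of-range handling is split out of __file_ref_put() into __file_ref_put_badval() so the slowpath stays reusable: a put() that finds the counter already in the released/dead zone is imbalanced and only warns, a put() on a saturated counter re-parks it at the saturation sentinel, and in neither case is the object destructed. A toy model with C11 atomics and made-up thresholds (the real FILE_REF_* constants differ):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* made-up zone boundaries; only the shape of the logic matters here */
enum { REF_MAXREF = 1000, REF_SATURATED = 1500, REF_RELEASED = 2000, REF_DEAD = 3000 };

static _Atomic unsigned long refcnt;

static bool ref_put_badval(unsigned long cnt)
{
        if (cnt >= REF_RELEASED) {
                fprintf(stderr, "imbalanced put on reference count\n");
                atomic_store(&refcnt, REF_DEAD);
                return false;           /* never destruct */
        }
        if (cnt > REF_MAXREF)
                atomic_store(&refcnt, REF_SATURATED);
        return false;
}

int main(void)
{
        atomic_store(&refcnt, REF_MAXREF + 7);  /* pretend the counter ran past its limit */
        ref_put_badval(atomic_load(&refcnt));
        printf("counter parked at %lu\n", atomic_load(&refcnt));
        return 0;
}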
/**
* __file_ref_put - Slowpath of file_ref_put()
* @ref: Pointer to the reference count
@@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
return true;
}
- /*
- * If the reference count was already in the dead zone, then this
- * put() operation is imbalanced. Warn, put the reference count back to
- * DEAD and tell the caller to not deconstruct the object.
- */
- if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
- atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
- return false;
- }
-
- /*
- * This is a put() operation on a saturated refcount. Restore the
- * mean saturation value and tell the caller to not deconstruct the
- * object.
- */
- if (cnt > FILE_REF_MAXREF)
- atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
- return false;
+ return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);
@@ -418,17 +423,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
@@ -577,6 +590,7 @@ repeat:
__set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
out:
spin_unlock(&files->file_lock);
@@ -612,22 +626,14 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array. At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us. We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
+/**
+ * fd_install - install a file pointer in the fd array
+ * @fd: file descriptor to install the file in
+ * @file: the file to install
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
-
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
@@ -642,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- WARN_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -650,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file)
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
@@ -679,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -1178,8 +1184,23 @@ struct fd fdget_raw(unsigned int fd)
*/
static inline bool file_needs_f_pos_lock(struct file *file)
{
- return (file->f_mode & FMODE_ATOMIC_POS) &&
- (file_count(file) > 1 || file->f_op->iterate_shared);
+ if (!(file->f_mode & FMODE_ATOMIC_POS))
+ return false;
+ if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
+ return true;
+ if (file->f_op->iterate_shared)
+ return true;
+ return false;
+}
+
+bool file_seek_cur_needs_f_lock(struct file *file)
+{
+ if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
+ return false;
+
+ VFS_WARN_ON_ONCE((file_count(file) > 1) &&
+ !mutex_is_locked(&file->f_pos_lock));
+ return true;
}
struct fd fdget_pos(unsigned int fd)
@@ -1187,7 +1208,7 @@ struct fd fdget_pos(unsigned int fd)
struct fd f = fdget(fd);
struct file *file = fd_file(f);
- if (file && file_needs_f_pos_lock(file)) {
+ if (likely(file) && file_needs_f_pos_lock(file)) {
f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
@@ -1230,14 +1251,34 @@ __releases(&files->file_lock)
struct fdtable *fdt;
/*
- * We need to detect attempts to do dup2() over allocated but still
- * not finished descriptor.
+ * dup2() is expected to close the file installed in the target fd slot
+ * (if any). However, userspace hand-picking a fd may be racing against
+ * its own threads which happened to allocate it in open() et al but did
+ * not populate it yet.
+ *
+ * Broadly speaking we may be racing against the following:
+ * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL
+ * file = hard_work_goes_here();
+ * fd_install(fd, file); // only now ->fd[fd] == file
+ *
+ * It is an invariant that a successfully allocated fd has a NULL entry
+ * in the array until the matching fd_install().
+ *
+ * If we fit the window, we have the fd to populate, yet no target file
+ * to close. Trying to ignore it and install our new file would violate
+ * the invariant and make fd_install() overwrite our file.
+ *
+ * Things can be done(tm) to handle this. However, the issue does not
+ * concern legitimate programs and we only need to make sure the kernel
+ * does not trip over it.
+ *
+ * The simplest way out is to return an error if we find ourselves here.
*
* POSIX is silent on the issue, we return -EBUSY.
*/
fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds);
- tofree = fdt->fd[fd];
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c00dc38558d..c04ed94cdc4b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Privileged users can go above max_files
*/
- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+ if (unlikely(get_nr_files() >= files_stat.max_files) &&
+ !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
@@ -511,31 +512,37 @@ void flush_delayed_fput(void)
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);
-void fput(struct file *file)
+static void __fput_deferred(struct file *file)
{
- if (file_ref_put(&file->f_ref)) {
- struct task_struct *task = current;
+ struct task_struct *task = current;
+
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
- if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
- file_free(file);
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
- }
- if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_task_work, ____fput);
- if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
- return;
- /*
- * After this task has run exit_task_work(),
- * task_work_add() will fail. Fall through to delayed
- * fput to avoid leaking *file.
- */
- }
-
- if (llist_add(&file->f_llist, &delayed_fput_list))
- schedule_delayed_work(&delayed_fput_work, 1);
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
+
+ if (llist_add(&file->f_llist, &delayed_fput_list))
+ schedule_delayed_work(&delayed_fput_work, 1);
}
+void fput(struct file *file)
+{
+ if (unlikely(file_ref_put(&file->f_ref)))
+ __fput_deferred(file);
+}
+EXPORT_SYMBOL(fput);
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -549,10 +556,32 @@ void __fput_sync(struct file *file)
if (file_ref_put(&file->f_ref))
__fput(file);
}
-
-EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+ if (likely(file_ref_put_close(&file->f_ref)))
+ __fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+ if (file_ref_put_close(&file->f_ref))
+ __fput_deferred(file);
+}
+
void __init files_init(void)
{
struct kmem_cache_args args = {
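fput_close() and fput_close_sync() above defer to file_ref_put_close(), which per the comments is optimized for being called with the last reference. The usual shape of such an optimization is a single compare-exchange from "exactly one reference" straight to "released", falling back to the ordinary decrement when that fails; a standalone C11 sketch of that shape (the kernel's file_ref_t uses a different counter encoding, so this is illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic long refcnt = 1;

static bool ref_put_close(void)
{
        long expected = 1;

        /* fast path: we expect to hold the last reference */
        if (atomic_compare_exchange_strong(&refcnt, &expected, 0))
                return true;
        /* slow path: regular decrement, free only if we were last after all */
        return atomic_fetch_sub(&refcnt, 1) == 1;
}

int main(void)
{
        if (ref_put_close())
                puts("last reference dropped, object may be freed");
        return 0;
}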
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3cd99e2dc6ac..cc57367fb641 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -65,7 +65,7 @@ struct wb_writeback_work {
* timestamps written to disk after 12 hours, but in the worst case a
* few inodes might not have their timestamps updated for 24 hours.
*/
-unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+static unsigned int dirtytime_expire_interval = 12 * 60 * 60;
static inline struct inode *wb_inode(struct list_head *head)
{
@@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}
-static int __init start_dirtytime_writeback(void)
-{
- schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- return 0;
-}
-__initcall(start_dirtytime_writeback);
-
-int dirtytime_interval_handler(const struct ctl_table *table, int write,
+static int dirtytime_interval_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write,
return ret;
}
+static const struct ctl_table vm_fs_writeback_table[] = {
+ {
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirtytime_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
+static int __init start_dirtytime_writeback(void)
+{
+ schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+ register_sysctl_init("vm", vm_fs_writeback_table);
+ return 0;
+}
+__initcall(start_dirtytime_writeback);
+
/**
* __mark_inode_dirty - internal function to mark an inode dirty
*
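The hunks above make dirtytime_expire_interval static and register the dirtytime_expire_seconds sysctl from fs-writeback.c itself instead of the shared table; the knob keeps its old location under /proc/sys/vm/, so existing userspace is unaffected. A minimal sketch reading it:

#include <stdio.h>

int main(void)
{
        unsigned int secs;
        FILE *f = fopen("/proc/sys/vm/dirtytime_expire_seconds", "r");

        if (!f) {
                perror("dirtytime_expire_seconds");
                return 1;
        }
        if (fscanf(f, "%u", &secs) == 1)
                printf("lazytime timestamps expire after %u seconds\n", secs);
        fclose(f);
        return 0;
}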
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 094a7f510edf..1aaf4cb2afb2 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_FD:
param.type = fs_value_is_file;
ret = -EBADF;
- param.file = fget(aux);
+ param.file = fget_raw(aux);
if (!param.file)
goto out_key;
param.dirfd = aux;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3805f9b06c9d..fa8f1141ea74 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -781,9 +781,9 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
- struct fuse_args *args, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
@@ -792,11 +792,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
struct fuse_forget_link *forget;
if (fuse_is_bad(dir))
- return -EIO;
+ return ERR_PTR(-EIO);
forget = fuse_alloc_forget();
if (!forget)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memset(&outarg, 0, sizeof(outarg));
args->nodeid = get_node_id(dir);
@@ -826,29 +826,43 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
&outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0);
if (!inode) {
fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
kfree(forget);
d_drop(entry);
d = d_splice_alias(inode, entry);
if (IS_ERR(d))
- return PTR_ERR(d);
+ return d;
- if (d) {
+ if (d)
fuse_change_entry_timeout(d, &outarg);
- dput(d);
- } else {
+ else
fuse_change_entry_timeout(entry, &outarg);
- }
fuse_dir_changed(dir);
- return 0;
+ return d;
out_put_forget_req:
if (err == -EEXIST)
fuse_invalidate_entry(entry);
kfree(forget);
- return err;
+ return ERR_PTR(err);
+}
+
+static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ /*
+ * Note that when creating anything other than a directory we
+ * can be sure create_new_entry() will NOT return an alternate
+ * dentry as d_splice_alias() only returns an alternate dentry
+ * for directories. So we don't need to check for that case
+ * when passing back the result.
+ */
+ WARN_ON_ONCE(S_ISDIR(mode));
+
+ return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode));
}
static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -871,7 +885,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(idmap, fm, &args, dir, entry, mode);
+ return create_new_nondir(idmap, fm, &args, dir, entry, mode);
}
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -898,8 +912,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
return err;
}
-static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_mount *fm = get_fuse_mount(dir);
@@ -934,7 +948,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[1].value = entry->d_name.name;
args.in_args[2].size = len;
args.in_args[2].value = link;
- return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
+ return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK);
}
void fuse_flush_time_update(struct inode *inode)
@@ -1131,7 +1145,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
+ err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
if (!err)
fuse_update_ctime_in_cache(inode);
else if (err == -EINTR)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..366516b98b3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
- return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+ return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+ NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c9bb3be21d2b..fd1147aa3891 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -820,7 +820,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*
* Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
@@ -885,7 +885,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*
* For writes, iomap_dio_rw only triggers manual page faults, so we
@@ -957,7 +957,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*/
@@ -1024,7 +1024,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*/
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 65c07aa95718..d7220a6fe8f5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -607,14 +607,19 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
if (gh && (ret & LM_OUT_CANCELED))
gfs2_holder_wake(gh);
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
- /* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
- list_move_tail(&gh->gh_list, &gl->gl_holders);
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
+ gl->gl_target = gl->gl_state;
gh = find_first_waiter(gl);
- gl->gl_target = gh->gh_state;
- if (do_promote(gl))
- goto out;
- goto retry;
+ if (gh) {
+ gl->gl_target = gh->gh_state;
+ if (do_promote(gl))
+ goto out;
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+ }
+ goto out;
}
/* Some error or failed "try lock" - report it */
if ((ret & LM_OUT_ERROR) ||
@@ -627,7 +632,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
switch(state) {
/* Unlocked due to conversion deadlock, try again */
case LM_ST_UNLOCKED:
-retry:
do_xmote(gl, gh, gl->gl_target);
break;
/* Conversion fails, unlock and try again */
@@ -661,7 +665,8 @@ retry:
do_promote(gl);
}
out:
- clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!test_bit(GLF_CANCELING, &gl->gl_flags))
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
static bool is_system_glock(struct gfs2_glock *gl)
@@ -807,6 +812,7 @@ skip_inval:
}
if (ls->ls_ops->lm_lock) {
+ set_bit(GLF_PENDING_REPLY, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
ret = ls->ls_ops->lm_lock(gl, target, lck_flags);
spin_lock(&gl->gl_lockref.lock);
@@ -825,6 +831,7 @@ skip_inval:
/* The operation will be completed asynchronously. */
return;
}
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
}
/* Complete the operation now. */
@@ -843,12 +850,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh = NULL;
+ struct gfs2_holder *gh;
if (test_bit(GLF_LOCK, &gl->gl_flags))
return;
set_bit(GLF_LOCK, &gl->gl_flags);
+ /* While a demote is in progress, the GLF_LOCK flag must be set. */
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
@@ -860,18 +868,22 @@ __acquires(&gl->gl_lockref.lock)
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
+ do_xmote(gl, NULL, gl->gl_target);
+ return;
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
if (do_promote(gl))
goto out_unlock;
gh = find_first_waiter(gl);
+ if (!gh)
+ goto out_unlock;
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
+ do_xmote(gl, gh, gl->gl_target);
+ return;
}
- do_xmote(gl, gh, gl->gl_target);
- return;
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -898,12 +910,8 @@ void glock_set_object(struct gfs2_glock *gl, void *object)
prev_object = gl->gl_object;
gl->gl_object = object;
spin_unlock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
- pr_warn("glock=%u/%llx\n",
- gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL))
gfs2_dump_glock(NULL, gl, true);
- }
}
/**
@@ -919,12 +927,8 @@ void glock_clear_object(struct gfs2_glock *gl, void *object)
prev_object = gl->gl_object;
gl->gl_object = NULL;
spin_unlock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) {
- pr_warn("glock=%u/%llx\n",
- gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object))
gfs2_dump_glock(NULL, gl, true);
- }
}
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
@@ -959,6 +963,25 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
gfs2_holder_uninit(&gh);
}
+static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip;
+
+ spin_lock(&gl->gl_lockref.lock);
+ ip = gl->gl_object;
+ if (ip && !igrab(&ip->i_inode))
+ ip = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (ip) {
+ wait_on_inode(&ip->i_inode);
+ if (is_bad_inode(&ip->i_inode)) {
+ iput(&ip->i_inode);
+ ip = NULL;
+ }
+ }
+ return ip;
+}
+
static void gfs2_try_evict(struct gfs2_glock *gl)
{
struct gfs2_inode *ip;
@@ -976,32 +999,15 @@ static void gfs2_try_evict(struct gfs2_glock *gl)
* happened below. (Verification is triggered by the call to
* gfs2_queue_verify_delete() in gfs2_evict_inode().)
*/
- spin_lock(&gl->gl_lockref.lock);
- ip = gl->gl_object;
- if (ip && !igrab(&ip->i_inode))
- ip = NULL;
- spin_unlock(&gl->gl_lockref.lock);
- if (ip) {
- wait_on_inode(&ip->i_inode);
- if (is_bad_inode(&ip->i_inode)) {
- iput(&ip->i_inode);
- ip = NULL;
- }
- }
+ ip = gfs2_grab_existing_inode(gl);
if (ip) {
- set_bit(GIF_DEFER_DELETE, &ip->i_flags);
+ set_bit(GLF_DEFER_DELETE, &gl->gl_flags);
d_prune_aliases(&ip->i_inode);
iput(&ip->i_inode);
+ clear_bit(GLF_DEFER_DELETE, &gl->gl_flags);
/* If the inode was evicted, gl->gl_object will now be NULL. */
- spin_lock(&gl->gl_lockref.lock);
- ip = gl->gl_object;
- if (ip) {
- clear_bit(GIF_DEFER_DELETE, &ip->i_flags);
- if (!igrab(&ip->i_inode))
- ip = NULL;
- }
- spin_unlock(&gl->gl_lockref.lock);
+ ip = gfs2_grab_existing_inode(gl);
if (ip) {
gfs2_glock_poke(ip->i_gl);
iput(&ip->i_inode);
@@ -1462,9 +1468,7 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
{
if (!(gh->gh_flags & GL_NOPID))
return true;
- if (gh->gh_state == LM_ST_UNLOCKED)
- return true;
- return false;
+ return !test_bit(HIF_HOLDER, &gh->gh_iflags);
}
/**
@@ -1483,7 +1487,6 @@ __acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
@@ -1519,21 +1522,11 @@ fail:
gfs2_holder_wake(gh);
return;
}
- if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
- continue;
}
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
- if (likely(insert_pt == NULL)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- return;
- }
- list_add_tail(&gh->gh_list, insert_pt);
- spin_unlock(&gl->gl_lockref.lock);
- if (sdp->sd_lockstruct.ls_ops->lm_cancel)
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_lockref.lock);
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
return;
trap_recursive:
@@ -1673,11 +1666,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
}
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+ test_bit(GLF_LOCK, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ !test_bit(GLF_CANCELING, &gl->gl_flags)) {
+ set_bit(GLF_CANCELING, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl);
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
spin_lock(&gl->gl_lockref.lock);
+ clear_bit(GLF_CANCELING, &gl->gl_flags);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!gfs2_holder_queued(gh))
+ goto out;
}
/*
@@ -1923,6 +1924,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
spin_lock(&gl->gl_lockref.lock);
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
@@ -2323,6 +2325,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'f';
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
*p++ = 'i';
+ if (test_bit(GLF_PENDING_REPLY, gflags))
+ *p++ = 'R';
if (test_bit(GLF_HAVE_REPLY, gflags))
*p++ = 'r';
if (test_bit(GLF_INITIAL, gflags))
@@ -2347,6 +2351,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'e';
if (test_bit(GLF_VERIFY_DELETE, gflags))
*p++ = 'E';
+ if (test_bit(GLF_DEFER_DELETE, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_CANCELING, gflags))
+ *p++ = 'C';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4e19cce3d906..74abbd4970f8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -330,6 +330,9 @@ enum {
GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */
GLF_TRY_TO_EVICT = 17, /* iopen glocks only */
GLF_VERIFY_DELETE = 18, /* iopen glocks only */
+ GLF_PENDING_REPLY = 19,
+ GLF_DEFER_DELETE = 20, /* iopen glocks only */
+ GLF_CANCELING = 21,
};
struct gfs2_glock {
@@ -376,7 +379,6 @@ enum {
GIF_SW_PAGED = 3,
GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
- GIF_DEFER_DELETE = 7,
};
struct gfs2_inode {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6fbbaaad1cd0..198a8cbaf5e5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1248,14 +1248,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir,
* @dentry: The dentry of the new directory
* @mode: The mode of the new directory
*
- * Returns: errno
+ * Returns: the dentry, or ERR_PTR(errno)
*/
-static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir));
- return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0);
+
+ return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0));
}
/**
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 314ec2a70167..0fd3b5ec7d8c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -157,7 +157,9 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock)
/**
* gfs2_end_log_write_bh - end log write of pagecache data with buffers
* @sdp: The superblock
- * @bvec: The bio_vec
+ * @folio: The folio
+ * @offset: The first byte within the folio that completed
+ * @size: The number of bytes that completed
* @error: The i/o status
*
* This finds the relevant buffers and unlocks them and sets the
@@ -166,17 +168,13 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock)
* that is pinned in the pagecache.
*/
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
- struct bio_vec *bvec,
- blk_status_t error)
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct folio *folio,
+ size_t offset, size_t size, blk_status_t error)
{
struct buffer_head *bh, *next;
- struct page *page = bvec->bv_page;
- unsigned size;
- bh = page_buffers(page);
- size = bvec->bv_len;
- while (bh_offset(bh) < bvec->bv_offset)
+ bh = folio_buffers(folio);
+ while (bh_offset(bh) < offset)
bh = bh->b_this_page;
do {
if (error)
@@ -186,7 +184,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
size -= bh->b_size;
brelse(bh);
bh = next;
- } while(bh && size);
+ } while (bh && size);
}
/**
@@ -203,7 +201,6 @@ static void gfs2_end_log_write(struct bio *bio)
{
struct gfs2_sbd *sdp = bio->bi_private;
struct bio_vec *bvec;
- struct page *page;
struct bvec_iter_all iter_all;
if (bio->bi_status) {
@@ -217,9 +214,12 @@ static void gfs2_end_log_write(struct bio *bio)
}
bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
- if (page_has_buffers(page))
- gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
+ struct page *page = bvec->bv_page;
+ struct folio *folio = page_folio(page);
+
+ if (folio && folio_buffers(folio))
+ gfs2_end_log_write_bh(sdp, folio, bvec->bv_offset,
+ bvec->bv_len, bio->bi_status);
else
mempool_free(page, gfs2_page_pool);
}
@@ -359,8 +359,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head);
gfs2_log_incr_head(sdp);
- gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size,
- bh_offset(bh), dblock);
+ gfs2_log_write(sdp, sdp->sd_jdesc, folio_page(bh->b_folio, 0),
+ bh->b_size, bh_offset(bh), dblock);
}
/**
@@ -406,17 +406,16 @@ static void gfs2_end_log_read(struct bio *bio)
}
/**
- * gfs2_jhead_pg_srch - Look for the journal head in a given page.
+ * gfs2_jhead_folio_search - Look for the journal head in a given folio.
* @jd: The journal descriptor
* @head: The journal head to start from
- * @page: The page to look in
+ * @folio: The folio to look in
*
* Returns: 1 if found, 0 otherwise.
*/
-
-static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head,
- struct page *page)
+static bool gfs2_jhead_folio_search(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head,
+ struct folio *folio)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host lh;
@@ -424,7 +423,8 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
unsigned int offset;
bool ret = false;
- kaddr = kmap_local_page(page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ kaddr = kmap_local_folio(folio, 0);
for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
if (lh.lh_sequence >= head->lh_sequence)
@@ -472,7 +472,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
*done = true;
if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, &folio->page);
+ *done = gfs2_jhead_folio_search(jd, head, folio);
/* filemap_get_folio() and the earlier grab_cache_page() */
folio_put_refs(folio, 2);
@@ -512,9 +512,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
unsigned int shift = PAGE_SHIFT - bsize_shift;
unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
- int sz, ret = 0;
+ int ret = 0;
struct bio *bio = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
bool done = false;
errseq_t since;
@@ -527,10 +527,11 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
u64 dblock = je->dblock;
for (; block < je->lblock + je->blocks; block++, dblock++) {
- if (!page) {
- page = grab_cache_page(mapping, block >> shift);
- if (!page) {
- ret = -ENOMEM;
+ if (!folio) {
+ folio = filemap_grab_folio(mapping,
+ block >> shift);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
done = true;
goto out;
}
@@ -541,8 +542,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
sector_t sector = dblock << sdp->sd_fsb2bb_shift;
if (bio_end_sector(bio) == sector) {
- sz = bio_add_page(bio, page, bsize, off);
- if (sz == bsize)
+ if (bio_add_folio(bio, folio, bsize, off))
goto block_added;
}
if (off) {
@@ -562,12 +562,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
add_block_to_new_bio:
- sz = bio_add_page(bio, page, bsize, off);
- BUG_ON(sz != bsize);
+ if (!bio_add_folio(bio, folio, bsize, off))
+ BUG();
block_added:
off += bsize;
- if (off == PAGE_SIZE)
- page = NULL;
+ if (off == folio_size(folio))
+ folio = NULL;
if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
@@ -615,15 +615,13 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
static void gfs2_check_magic(struct buffer_head *bh)
{
- void *kaddr;
__be32 *ptr;
clear_buffer_escaped(bh);
- kaddr = kmap_local_page(bh->b_page);
- ptr = kaddr + bh_offset(bh);
+ ptr = kmap_local_folio(bh->b_folio, bh_offset(bh));
if (*ptr == cpu_to_be32(GFS2_MAGIC))
set_buffer_escaped(bh);
- kunmap_local(kaddr);
+ kunmap_local(ptr);
}
static int blocknr_cmp(void *priv, const struct list_head *a,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index fea3efcc2f93..198cc7056637 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -198,15 +198,14 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
static void gfs2_meta_read_endio(struct bio *bio)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct buffer_head *bh = page_buffers(page);
- unsigned int len = bvec->bv_len;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct buffer_head *bh = folio_buffers(folio);
+ size_t len = fi.length;
- while (bh_offset(bh) < bvec->bv_offset)
+ while (bh_offset(bh) < fi.offset)
bh = bh->b_this_page;
do {
struct buffer_head *next = bh->b_this_page;
@@ -232,7 +231,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
while (num > 0) {
bh = *bhs;
- if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
+ if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) {
BUG_ON(bio->bi_iter.bi_size == 0);
break;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 92a3b6ddafdc..44e5658b896c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1329,7 +1329,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags)))
goto should_delete;
- if (test_bit(GIF_DEFER_DELETE, &ip->i_flags))
+ if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
+ test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags))
return EVICT_SHOULD_DEFER_DELETE;
/* Deletes should never happen under memory pressure anymore. */
@@ -1338,12 +1339,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
/* Must not read inode block until block type has been verified */
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh);
- if (unlikely(ret)) {
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
- return EVICT_SHOULD_DEFER_DELETE;
- }
+ if (unlikely(ret))
+ return EVICT_SHOULD_SKIP_DELETE;
if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
return EVICT_SHOULD_SKIP_DELETE;
@@ -1363,15 +1360,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
should_delete:
if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
- test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- enum evict_behavior behavior =
- gfs2_upgrade_iopen_glock(inode);
-
- if (behavior != EVICT_SHOULD_DELETE) {
- gfs2_holder_uninit(&ip->i_iopen_gh);
- return behavior;
- }
- }
+ test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+ return gfs2_upgrade_iopen_glock(inode);
return EVICT_SHOULD_DELETE;
}
@@ -1509,7 +1499,7 @@ static void gfs2_evict_inode(struct inode *inode)
gfs2_glock_put(io_gl);
goto out;
}
- behavior = EVICT_SHOULD_DELETE;
+ behavior = EVICT_SHOULD_SKIP_DELETE;
}
if (behavior == EVICT_SHOULD_DELETE)
ret = evict_unlinked_inode(inode);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 8eae8d62a413..26036ffc3f33 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -53,12 +53,20 @@
{(1UL << GLF_DIRTY), "y" }, \
{(1UL << GLF_LFLUSH), "f" }, \
{(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
+ {(1UL << GLF_PENDING_REPLY), "R" }, \
{(1UL << GLF_HAVE_REPLY), "r" }, \
{(1UL << GLF_INITIAL), "a" }, \
{(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \
{(1UL << GLF_LRU), "L" }, \
{(1UL << GLF_OBJECT), "o" }, \
- {(1UL << GLF_BLOCKING), "b" })
+ {(1UL << GLF_BLOCKING), "b" }, \
+ {(1UL << GLF_UNLOCKED), "x" }, \
+ {(1UL << GLF_INSTANTIATE_NEEDED), "n" }, \
+ {(1UL << GLF_INSTANTIATE_IN_PROG), "N" }, \
+ {(1UL << GLF_TRY_TO_EVICT), "e" }, \
+ {(1UL << GLF_VERIFY_DELETE), "E" }, \
+ {(1UL << GLF_DEFER_DELETE), "s" }, \
+ {(1UL << GLF_CANCELING), "C" })
#ifndef NUMPTY
#define NUMPTY
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 192213c7359a..f8ae2c666fd6 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -246,12 +246,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
if (bd == NULL) {
gfs2_log_unlock(sdp);
unlock_buffer(bh);
- lock_page(bh->b_page);
+ folio_lock(bh->b_folio);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh);
else
bd = bh->b_private;
- unlock_page(bh->b_page);
+ folio_unlock(bh->b_folio);
lock_buffer(bh);
gfs2_log_lock(sdp);
}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b75c26045df4..86a6b317b474 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir,
* in a directory, given the inode for the parent directory and the
* name (and its length) of the new directory.
*/
-static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int res;
inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
- return res;
+ return ERR_PTR(res);
}
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
- return 0;
+ return NULL;
}
/*
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f5c4b3e31a1c..876bbb80fb4d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir,
return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
}
-static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
+ return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0));
}
static int hfsplus_rename(struct mnt_idmap *idmap,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e0741e468956..a2c6b9051c5b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -679,17 +679,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino,
return err;
}
-static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
+ struct dentry *dentry, umode_t mode)
{
+ struct inode *inode;
char *file;
int err;
if ((file = dentry_name(dentry)) == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
err = do_mkdir(file, mode);
+ if (err) {
+ dentry = ERR_PTR(err);
+ } else {
+ inode = hostfs_iget(dentry->d_sb, file);
+ d_drop(dentry);
+ dentry = d_splice_alias(inode, dentry);
+ }
__putname(file);
- return err;
+ return dentry;
}
static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d0edf9ed33b6..e3cdc421dfba 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir)
hpfs_write_inode_nolock(dir);
}
-static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
@@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
int r;
struct hpfs_dirent dee;
int err;
- if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+ if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? -EINVAL : err);
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
- return 0;
+ return NULL;
bail3:
iput(result);
bail2:
@@ -123,7 +123,7 @@ bail1:
hpfs_free_sectors(dir->i_sb, fno, 1);
bail:
hpfs_unlock(dir->i_sb);
- return err;
+ return ERR_PTR(err);
}
static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0fc179a59830..d98caedbb723 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -991,14 +991,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(idmap, dir, dentry,
mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int hugetlbfs_create(struct mnt_idmap *idmap,
diff --git a/fs/init.c b/fs/init.c
index e9387b6c4f30..eef5124885e3 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode)
return PTR_ERR(dentry);
mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
- if (!error)
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ if (!error) {
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
done_path_create(&path, dentry);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 5587aabdaa5e..99318b157a9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head)
free_inode_nonrcu(inode);
}
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * alloc_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for the given superblock.
+ * The inode won't be chained into the superblock's s_inodes list.
+ * This means:
+ * - the fs can't be unmounted
+ * - quotas, fsnotify, writeback can't work
+ */
+struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
@@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ struct super_block *sb = inode->i_sb;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_add(&inode->i_sb_list, &sb->s_inodes);
+ spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
+ struct super_block *sb = inode->i_sb;
+
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode->i_sb->s_inode_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
}
@@ -806,23 +820,16 @@ static void evict(struct inode *inode)
/*
* Wake up waiters in __wait_on_freeing_inode().
*
- * Lockless hash lookup may end up finding the inode before we removed
- * it above, but only lock it *after* we are done with the wakeup below.
- * In this case the potential waiter cannot safely block.
+ * It is an invariant that any thread we need to wake up is already
+ * accounted for before remove_inode_hash() acquires ->i_lock -- both
+ * sides take the lock and sleep is aborted if the inode is found
+ * unhashed. Thus either the sleeper wins and goes off CPU, or removal
+ * wins and the sleeper aborts after testing with the lock.
*
- * The inode being unhashed after the call to remove_inode_hash() is
- * used as an indicator whether blocking on it is safe.
+ * This also means we don't need any fences for the call below.
*/
- spin_lock(&inode->i_lock);
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
- spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
@@ -900,46 +907,6 @@ again:
}
EXPORT_SYMBOL_GPL(evict_inodes);
-/**
- * invalidate_inodes - attempt to free all inodes on a superblock
- * @sb: superblock to operate on
- *
- * Attempts to free all inodes (including dirty inodes) for a given superblock.
- */
-void invalidate_inodes(struct super_block *sb)
-{
- struct inode *inode, *next;
- LIST_HEAD(dispose);
-
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
-}
-
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
@@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void)
EXPORT_SYMBOL(get_next_ino);
/**
- * new_inode_pseudo - obtain an inode
- * @sb: superblock
- *
- * Allocates a new inode for given superblock.
- * Inode wont be chained in superblock s_inodes list
- * This means :
- * - fs can't be unmount
- * - quotas, fsnotify, writeback can't work
- */
-struct inode *new_inode_pseudo(struct super_block *sb)
-{
- return alloc_inode(sb);
-}
-
-/**
* new_inode - obtain an inode
* @sb: superblock
*
@@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- inode = new_inode_pseudo(sb);
+ inode = alloc_inode(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
@@ -1348,8 +1300,8 @@ again:
}
if (set && unlikely(set(inode, data))) {
- inode = NULL;
- goto unlock;
+ spin_unlock(&inode_hash_lock);
+ return NULL;
}
/*
@@ -1361,14 +1313,14 @@ again:
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_hash_lock);
+
/*
* Add inode to the sb list if it's not already. It has I_NEW at this
* point, so it should be safe to test i_sb_list locklessly.
*/
if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
-unlock:
- spin_unlock(&inode_hash_lock);
return inode;
}
@@ -1497,8 +1449,8 @@ again:
inode->i_state = I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- inode_sb_list_add(inode);
spin_unlock(&inode_hash_lock);
+ inode_sb_list_add(inode);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap,
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
+
+#ifdef CONFIG_DEBUG_VFS
+/*
+ * Dump an inode.
+ *
+ * TODO: add a proper inode dumping routine; this is a stub to get debugging
+ * off the ground.
+ */
+void dump_inode(struct inode *inode, const char *reason)
+{
+ pr_warn("%s encountered for inode %px", reason, inode);
+}
+
+EXPORT_SYMBOL(dump_inode);
+#endif
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..b9b3e29a73fd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file)
}
}
+void fput_close_sync(struct file *);
+void fput_close(struct file *);
+
/*
* super.c
*/
@@ -187,8 +190,8 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
-long do_ftruncate(struct file *file, loff_t length, int small);
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_ftruncate(struct file *file, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag);
@@ -207,7 +210,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
@@ -325,6 +327,7 @@ struct stashed_operations {
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
* path_mounted - check whether path is mounted
* @path: path to check
@@ -338,3 +341,5 @@ static inline bool path_mounted(const struct path *path)
return path->mnt->mnt_root == path->dentry;
}
void file_f_owner_release(struct file *file);
+bool file_seek_cur_needs_f_lock(struct file *file);
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 638a36be31c1..c91fd2b46a77 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
return error;
}
-static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
CLASS(fd, src_file)(srcfd);
loff_t cloned;
@@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
return ret;
}
-static long ioctl_file_clone_range(struct file *file,
- struct file_clone_range __user *argp)
+static int ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..69e8ebb41302 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
+ ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d303e6c8900c..814b7f679486 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
-#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
-#define IOEND_BATCH_SIZE 4096
-
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
-static struct bio_set iomap_ioend_bioset;
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -366,20 +362,24 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
-static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx, loff_t offset)
+static int iomap_readpage_iter(struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
{
const struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos + offset;
- loff_t length = iomap_length(iter) - offset;
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
- loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
+ int ret;
- if (iomap->type == IOMAP_INLINE)
- return iomap_read_inline_data(iter, folio);
+ if (iomap->type == IOMAP_INLINE) {
+ ret = iomap_read_inline_data(iter, folio);
+ if (ret)
+ return ret;
+ return iomap_iter_advance(iter, &length);
+ }
/* zero post-eof blocks as the page may be mapped */
ifs = ifs_alloc(iter->inode, folio, iter->flags);
@@ -438,25 +438,22 @@ done:
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
- return pos - orig_pos + plen;
+ length = pos - iter->pos + plen;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- struct folio *folio = ctx->cur_folio;
- size_t offset = offset_in_folio(folio, iter->pos);
- loff_t length = min_t(loff_t, folio_size(folio) - offset,
- iomap_length(iter));
- loff_t done, ret;
-
- for (done = 0; done < length; done += ret) {
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ int ret;
+
+ while (iomap_length(iter)) {
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +471,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -493,15 +490,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
-static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+static int iomap_readahead_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- loff_t length = iomap_length(iter);
- loff_t done, ret;
+ int ret;
- for (done = 0; done < length; done += ret) {
+ while (iomap_length(iter)) {
if (ctx->cur_folio &&
- offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+ offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
if (!ctx->cur_folio_in_bio)
folio_unlock(ctx->cur_folio);
ctx->cur_folio = NULL;
@@ -510,12 +506,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
ctx->cur_folio = readahead_folio(ctx->rac);
ctx->cur_folio_in_bio = false;
}
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
/**
@@ -547,7 +543,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.processed = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -603,6 +599,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ if (iter->flags & IOMAP_DONTCACHE)
+ fgp |= FGP_DONTCACHE;
fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -907,12 +905,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
-static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
- loff_t length = iomap_length(iter);
- loff_t pos = iter->pos;
ssize_t total_written = 0;
- long status = 0;
+ int status = 0;
struct address_space *mapping = iter->inode->i_mapping;
size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -923,7 +919,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- size_t written; /* Bytes have been written */
+ u64 written; /* Bytes have been written */
+ loff_t pos = iter->pos;
bytes = iov_iter_count(i);
retry:
@@ -934,8 +931,8 @@ retry:
if (unlikely(status))
break;
- if (bytes > length)
- bytes = length;
+ if (bytes > iomap_length(iter))
+ bytes = iomap_length(iter);
/*
* Bring in the user page that we'll copy from _first_.
@@ -1006,17 +1003,12 @@ retry:
goto retry;
}
} else {
- pos += written;
total_written += written;
- length -= written;
+ iomap_iter_advance(iter, &written);
}
- } while (iov_iter_count(i) && length);
+ } while (iov_iter_count(i) && iomap_length(iter));
- if (status == -EAGAIN) {
- iov_iter_revert(i, total_written);
- return -EAGAIN;
- }
- return total_written ? total_written : status;
+ return total_written ? 0 : status;
}
ssize_t
@@ -1034,9 +1026,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_write_iter(&iter, i);
+ iter.status = iomap_write_iter(&iter, i);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1270,23 +1264,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
-static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
if (!iomap_want_unshare_iter(iter))
- return length;
+ return iomap_iter_advance(iter, &bytes);
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
@@ -1304,14 +1297,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
cond_resched();
- pos += bytes;
- written += bytes;
- length -= bytes;
-
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length > 0);
- return written;
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
+
+ return status;
}
int
@@ -1331,7 +1324,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_unshare_iter(&iter);
+ iter.status = iomap_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,19 +1343,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
@@ -1383,25 +1375,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (WARN_ON_ONCE(!ret))
return -EIO;
- pos += bytes;
- length -= bytes;
- written += bytes;
- } while (length > 0);
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return status;
}
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
+ .private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1417,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1443,17 +1436,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
if (srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN) {
- loff_t proc = iomap_length(&iter);
+ s64 status;
if (range_dirty) {
range_dirty = false;
- proc = iomap_zero_iter_flush_and_stale(&iter);
+ status = iomap_zero_iter_flush_and_stale(&iter);
+ } else {
+ status = iomap_iter_advance_full(&iter);
}
- iter.processed = proc;
+ iter.status = status;
continue;
}
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
}
return ret;
}
@@ -1461,7 +1456,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1464,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
- return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+ private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
-static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
struct folio *folio)
{
loff_t length = iomap_length(iter);
@@ -1490,14 +1486,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
- return length;
+ return iomap_iter_advance(iter, &length);
}
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
+ .private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@@ -1509,7 +1507,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+ iter.status = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
@@ -1538,16 +1536,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
- if (error) {
- mapping_set_error(inode->i_mapping, error);
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1563,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
- struct list_head tmp;
- u32 completions;
-
- might_sleep();
-
- list_replace_init(&ioend->io_list, &tmp);
- completions = iomap_finish_ioend(ioend, error);
-
- while (!list_empty(&tmp)) {
- if (completions > IOEND_BATCH_SIZE * 8) {
- cond_resched();
- completions = 0;
- }
- ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
- list_del_init(&ioend->io_list);
- completions += iomap_finish_ioend(ioend, error);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
- if (ioend->io_bio.bi_status != next->io_bio.bi_status)
- return false;
- if (next->io_flags & IOMAP_F_BOUNDARY)
- return false;
- if ((ioend->io_flags & IOMAP_F_SHARED) ^
- (next->io_flags & IOMAP_F_SHARED))
- return false;
- if ((ioend->io_type == IOMAP_UNWRITTEN) ^
- (next->io_type == IOMAP_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- /*
- * Do not merge physically discontiguous ioends. The filesystem
- * completion functions will have to iterate the physical
- * discontiguities even if we merge the ioends at a logical level, so
- * we don't gain anything by merging physical discontiguities here.
- *
- * We cannot use bio->bi_iter.bi_sector here as it is modified during
- * submission so does not point to the start sector of the bio at
- * completion.
- */
- if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
- return false;
- return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
- struct iomap_ioend *next;
-
- INIT_LIST_HEAD(&ioend->io_list);
-
- while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
- io_list))) {
- if (!iomap_ioend_can_merge(ioend, next))
- break;
- list_move_tail(&next->io_list, &ioend->io_list);
- ioend->io_size += next->io_size;
- }
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
- struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
- if (ia->io_offset < ib->io_offset)
- return -1;
- if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
- list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
static void iomap_writepage_end_bio(struct bio *bio)
{
- iomap_finish_ioend(iomap_ioend_from_bio(bio),
- blk_status_to_errno(bio->bi_status));
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
}
/*
- * Submit the final bio for an ioend.
+ * Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1591,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
- if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(wpc->ioend, error);
+ if (wpc->ops->submit_ioend) {
+ error = wpc->ops->submit_ioend(wpc, error);
+ } else {
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+ if (!error)
+ submit_bio(&wpc->ioend->io_bio);
+ }
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
- } else {
- submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
@@ -1709,9 +1610,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
+ struct writeback_control *wbc, struct inode *inode, loff_t pos,
+ u16 ioend_flags)
{
- struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1620,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
- wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
-
- ioend = iomap_ioend_from_bio(bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = wpc->iomap.type;
- ioend->io_flags = wpc->iomap.flags;
- if (pos > wpc->iomap.offset)
- wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = pos;
- ioend->io_sector = bio->bi_iter.bi_sector;
-
+ wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
- return ioend;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
{
- if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- return false;
- if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
- (wpc->ioend->io_flags & IOMAP_F_SHARED))
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
- if (wpc->iomap.type != wpc->ioend->io_type)
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (iomap_sector(&wpc->iomap, pos) !=
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
@@ -1779,14 +1668,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1960,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-
-static int __init iomap_buffered_init(void)
-{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_bio),
- BIOSET_NEED_BVECS);
-}
-fs_initcall(iomap_buffered_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 0e47da82b0c2..844261a31156 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
- if (dio->dops && dio->dops->submit_io)
+ if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
- else
+ } else {
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
+ }
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
{
- struct iomap_dio *dio = bio->bi_private;
- bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
- if (bio->bi_status)
- iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
- if (!atomic_dec_and_test(&dio->ref))
- goto release_bio;
-
- /*
- * Synchronous dio, task itself will handle any completion work
- * that needs after IO. All we need to do is wake the task.
- */
if (dio->wait_for_completion) {
+ /*
+ * Synchronous I/O: the task itself will handle any completion work
+ * that is needed after IO. All we need to do is wake the task.
+ */
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- goto release_bio;
- }
-
- /*
- * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
- */
- if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
- goto release_bio;
- }
-
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
- * our completion that way to avoid an async punt to a workqueue.
- */
- if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+ * schedule our completion that way to avoid an async punt to a
+ * workqueue.
+ */
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
- goto release_bio;
+ } else {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Async DIO completion that requires filesystem level
+ * completion work gets punted to a work queue to complete as
+ * the operation may require more IO to be issued to finalise
+ * filesystem metadata changes or guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+ if (atomic_dec_and_test(&dio->ref))
+ iomap_dio_done(dio);
- /*
- * Async DIO completion that requires filesystem level completion work
- * gets punted to a work queue to complete as the operation may require
- * more IO to be issued to finalise filesystem metadata changes or
- * guarantee data integrity.
- */
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
- &dio->aio.work);
-release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -239,6 +244,47 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+ struct iomap_dio *dio = ioend->io_bio.bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ u32 vec_count = ioend->io_bio.bi_vcnt;
+
+ if (ioend->io_error)
+ iomap_dio_set_error(dio, ioend->io_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ /*
+ * Try to avoid another context switch for the completion given
+ * that we are already called from the ioend completion
+ * workqueue, but never invalidate pages from this thread to
+ * avoid deadlocks with buffered I/O completions. Tough luck if
+ * you hit the tiny race with someone dirtying the range now
+ * between this check and the actual completion.
+ */
+ if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ iomap_dio_done(dio);
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(&ioend->io_bio);
+ } else {
+ bio_release_pages(&ioend->io_bio, false);
+ bio_put(&ioend->io_bio);
+ }
+
+ /*
+ * Return the number of bvecs completed as even direct I/O completions
+ * do significant per-folio work and we'll still want to give up the
+ * CPU after a lot of completions.
+ */
+ return vec_count;
+}
+
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
@@ -266,81 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
}
/*
- * Figure out the bio's operation flags from the dio request, the
- * mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_THROUGH flag in the dio request.
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O
+ * that doesn't require any metadata updates (including after I/O completion
+ * such as unwritten extent conversion) and the underlying device either
+ * doesn't have a volatile write cache or supports FUA.
+ * This allows us to avoid cache flushes on I/O completion.
*/
-static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua, bool atomic)
+static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
+ struct iomap_dio *dio)
{
- blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
-
- if (!(dio->flags & IOMAP_DIO_WRITE))
- return REQ_OP_READ;
-
- opflags |= REQ_OP_WRITE;
- if (use_fua)
- opflags |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- if (atomic)
- opflags |= REQ_ATOMIC;
-
- return opflags;
+ if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
+ return false;
+ if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
+ return false;
+ return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
}
-static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
const loff_t length = iomap_length(iter);
- bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
- blk_opf_t bio_opf;
+ blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
int nr_pages, ret = 0;
- size_t copied = 0;
+ u64 copied = 0;
size_t orig_count;
- if (atomic && length != fs_block_size)
- return -EINVAL;
-
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_opf |= REQ_OP_WRITE;
+
+ if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
+ /*
+ * Ensure that the mapping covers the full write
+ * length, otherwise it won't be submitted as a single
+ * bio, which is required to use hardware atomics.
+ */
+ if (length != iter->len)
+ return -EINVAL;
+ bio_opf |= REQ_ATOMIC;
+ }
- if (iomap->flags & IOMAP_F_SHARED)
- dio->flags |= IOMAP_DIO_COW;
+ if (iomap->type == IOMAP_UNWRITTEN) {
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ }
+
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+
+ if (iomap->flags & IOMAP_F_NEW) {
+ need_zeroout = true;
+ } else if (iomap->type == IOMAP_MAPPED) {
+ if (iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ }
- if (iomap->flags & IOMAP_F_NEW) {
- need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this is a pure
- * data IO that doesn't require any metadata updates (including
- * after IO completion such as unwritten extent conversion) and
- * the underlying device either supports FUA or doesn't have
- * a volatile write cache. This allows us to avoid cache flushes
- * on IO completion. If we can't use writethrough and need to
- * sync, disable in-task completions as dio completion will
- * need to call generic_write_sync() which will do a blocking
- * fsync / cache flush call.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional I/O at completion time.
+ *
+ * This rules out writes that need zeroing or extent conversion,
+ * extend the file size, or issue metadata I/O or cache flushes
+ * during completion processing.
*/
- if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
- (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
- use_fua = true;
- else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ if (need_zeroout || (pos >= i_size_read(inode)) ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
+ !(bio_opf & REQ_FUA)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ } else {
+ bio_opf |= REQ_OP_READ;
}
/*
@@ -355,18 +405,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only do deferred completion for pure overwrites that
- * don't require additional IO at completion. This rules out
- * writes that need zeroing or extent conversion, extend
- * the file size, or issue journal IO or cache flushes
- * during completion processing.
- */
- if (need_zeroout ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
- ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
-
- /*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
@@ -383,8 +421,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
-
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
@@ -416,9 +452,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
- if (WARN_ON_ONCE(atomic && n != length)) {
+ if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
/*
- * This bio should have covered the complete length,
+ * An atomic write bio must cover the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
@@ -465,30 +501,28 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return copied;
+ return iomap_iter_advance(iter, &copied);
return ret;
}
-static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
- return length;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
- struct iomap_dio *dio)
+static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- size_t copied;
+ u64 copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
@@ -510,11 +544,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
dio->size += copied;
if (!copied)
return -EFAULT;
- return copied;
+ return iomap_iter_advance(iomi, &copied);
}
-static loff_t iomap_dio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
@@ -608,9 +641,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
- if (iocb->ki_flags & IOCB_ATOMIC)
- iomi.flags |= IOMAP_ATOMIC;
-
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -645,6 +675,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -698,7 +731,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
- iomi.processed = iomap_dio_iter(&iomi, dio);
+ iomi.status = iomap_dio_iter(&iomi, dio);
/*
* We can only poll for single bio I/Os.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 610ca6f1ec9b..80675c42e94e 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
-static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+static int iomap_fiemap_iter(struct iomap_iter *iter,
struct fiemap_extent_info *fi, struct iomap *prev)
{
int ret;
if (iter->iomap.type == IOMAP_HOLE)
- return iomap_length(iter);
+ goto advance;
ret = iomap_to_fiemap(fi, prev, 0);
*prev = iter->iomap;
- switch (ret) {
- case 0: /* success */
- return iomap_length(iter);
- case 1: /* extent array full */
- return 0;
- default: /* error */
+ if (ret < 0)
return ret;
- }
+ if (ret == 1) /* extent array full */
+ return 0;
+
+advance:
+ return iomap_iter_advance_full(iter);
}
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
+ iter.status = iomap_fiemap_iter(&iter, fi, &prev);
if (prev.type != IOMAP_HOLE) {
ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type == IOMAP_MAPPED)
bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
- /* leave iter.processed unset to abort loop */
+ /* leave iter.status unset to abort loop */
}
if (ret)
return 0;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..f6992a3bf66a
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE 4096
+
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..18894ebba6db
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024-2025 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include "internal.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+ struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_parent = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_flags = ioend_flags;
+ ioend->io_inode = inode;
+ ioend->io_offset = file_offset;
+ ioend->io_size = bio->bi_iter.bi_size;
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ ioend->io_private = NULL;
+ return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+ return iomap_finish_ioend_direct(ioend);
+ return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
+
+ list_replace_init(&ioend->io_list, &tmp);
+ completions = iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ completions += iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
+{
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+ return false;
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+ next->io_sector)
+ return false;
+ return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
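As a rough illustration of how the sort/merge/finish helpers above are meant to be combined (a sketch, not part of this patch; foofs_finish_ioends and its completion list are hypothetical), a filesystem completion path could look like this:

	static void foofs_finish_ioends(struct list_head *completion_list, int error)
	{
		struct iomap_ioend *ioend;

		/* Sort by file offset so adjacent ioends end up next to each other. */
		iomap_sort_ioends(completion_list);

		while ((ioend = list_first_entry_or_null(completion_list,
				struct iomap_ioend, io_list))) {
			list_del_init(&ioend->io_list);
			/* Pull mergeable successors onto ioend->io_list ... */
			iomap_ioend_try_merge(ioend, completion_list);
			/* ... and complete the whole merged chain in one go. */
			iomap_finish_ioends(ioend, error);
		}
	}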
+
+/*
+ * Split up to the first @max_len bytes from @ioend if the ioend covers more
+ * than @max_len bytes.
+ *
+ * If @is_append is set, the split will be based on the hardware limits for
+ * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
+ * limits don't allow the entire @max_len length.
+ *
+ * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
+ * does not allow splitting REQ_OP_ZONE_APPEND bios. The file system has to
+ * switch the operation after this call, but before submitting the bio.
+ */
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append)
+{
+ struct bio *bio = &ioend->io_bio;
+ struct iomap_ioend *split_ioend;
+ unsigned int nr_segs;
+ int sector_offset;
+ struct bio *split;
+
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ max_len = min(max_len,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
+ if (unlikely(sector_offset < 0))
+ return ERR_PTR(sector_offset);
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= max_len)
+ return NULL;
+ sector_offset = max_len >> SECTOR_SHIFT;
+ }
+
+ /* ensure the split ioend is still block size aligned */
+ sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+ i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+ split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+ if (IS_ERR(split))
+ return ERR_CAST(split);
+ split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
+ split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+ ioend->io_flags);
+ split_ioend->io_parent = ioend;
+
+ atomic_inc(&ioend->io_remaining);
+ ioend->io_offset += split_ioend->io_size;
+ ioend->io_size -= split_ioend->io_size;
+
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+ return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
+
+static int __init iomap_ioend_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_ioend_init);
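A hedged sketch of the zone-append pattern that the iomap_split_ioend() comment above describes: split off what fits into one append command, switch the bio operation, and submit. Only iomap_split_ioend() itself comes from this patch; foofs_submit_zone_append, the max_append_len parameter, and the elided error handling are illustrative.

	static void foofs_submit_zone_append(struct iomap_ioend *ioend,
			unsigned int max_append_len)
	{
		struct iomap_ioend *split;

		/* Carve off the first chunk that fits into one zone-append command. */
		split = iomap_split_ioend(ioend, max_append_len, true);
		if (IS_ERR(split))
			return;			/* error handling elided */
		if (!split)
			split = ioend;		/* the whole ioend already fits */

		/* The bio was built as REQ_OP_WRITE; switch the op before submission. */
		split->io_bio.bi_opf &= ~REQ_OP_MASK;
		split->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
		submit_bio(&split->io_bio);
		/* If a split happened, the trimmed remainder of @ioend is submitted later. */
	}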
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 3790918646af..6ffc6a7b9ba5 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,40 +7,25 @@
#include <linux/iomap.h>
#include "trace.h"
-/*
- * Advance to the next range we need to map.
- *
- * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
- * processed - it was aborted because the extent the iomap spanned may have been
- * changed during the operation. In this case, the iteration behaviour is to
- * remap the unprocessed range of the iter, and that means we may need to remap
- * even when we've made no progress (i.e. iter->processed = 0). Hence the
- * "finished iterating" case needs to distinguish between
- * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
- * need to remap the entire remaining range.
- */
-static inline int iomap_iter_advance(struct iomap_iter *iter)
+static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
- bool stale = iter->iomap.flags & IOMAP_F_STALE;
- int ret = 1;
-
- /* handle the previous iteration (if any) */
- if (iter->iomap.length) {
- if (iter->processed < 0)
- return iter->processed;
- if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
- return -EIO;
- iter->pos += iter->processed;
- iter->len -= iter->processed;
- if (!iter->len || (!iter->processed && !stale))
- ret = 0;
- }
-
- /* clear the per iteration state */
- iter->processed = 0;
+ iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
- return ret;
+}
+
+/*
+ * Advance the current iterator position and output the length remaining for the
+ * current mapping.
+ */
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+{
+ if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ return -EIO;
+ iter->pos += *count;
+ iter->len -= *count;
+ *count = iomap_length(iter);
+ return 0;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
@@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+ iter->iter_start_pos = iter->pos;
+
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
@@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
* function must be called in a loop that continues as long it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
- * of the loop body: leave @iter.processed unchanged, or set it to a negative
+ * of the loop body: leave @iter.status unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+ ssize_t advanced;
+ u64 olen;
int ret;
- if (iter->iomap.length && ops->iomap_end) {
- ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
- iter->processed > 0 ? iter->processed : 0,
- iter->flags, &iter->iomap);
- if (ret < 0 && !iter->processed)
+ trace_iomap_iter(iter, ops, _RET_IP_);
+
+ if (!iter->iomap.length)
+ goto begin;
+
+ /*
+	 * Calculate how far the iter was advanced and the original length in bytes
+ * for ->iomap_end().
+ */
+ advanced = iter->pos - iter->iter_start_pos;
+ olen = iter->len + advanced;
+
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
+ iomap_length_trim(iter, iter->iter_start_pos,
+ olen),
+ advanced, iter->flags, &iter->iomap);
+ if (ret < 0 && !advanced)
return ret;
}
- trace_iomap_iter(iter, ops, _RET_IP_);
- ret = iomap_iter_advance(iter);
+ /* detect old return semantics where this would advance */
+ if (WARN_ON_ONCE(iter->status > 0))
+ iter->status = -EIO;
+
+ /*
+ * Use iter->len to determine whether to continue onto the next mapping.
+ * Explicitly terminate on error status or if the current iter has not
+ * advanced at all (i.e. no work was done for some reason) unless the
+ * mapping has been marked stale and needs to be reprocessed.
+ */
+ if (iter->status < 0)
+ ret = iter->status;
+ else if (iter->len == 0 || (!advanced && !stale))
+ ret = 0;
+ else
+ ret = 1;
+ iomap_iter_reset_iomap(iter);
if (ret <= 0)
return ret;
+begin:
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
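For callers outside this series, the conversion pattern implied by the new iter.status/iomap_iter_advance() interface looks roughly like the sketch below. foofs_zero_iter and foofs_zero_extent are hypothetical; the point is that iteration helpers now return an int status and advance the iter explicitly instead of returning a processed byte count.

	static int foofs_zero_iter(struct iomap_iter *iter)
	{
		u64 count = iomap_length(iter);

		if (iter->iomap.type != IOMAP_HOLE)
			foofs_zero_extent(iter->inode, iter->pos, count);   /* hypothetical */

		/* Consume the whole mapping; 0 on success goes into iter.status. */
		return iomap_iter_advance(iter, &count);
	}

	/* caller-side fragment: iter.status replaces the old iter.processed count */
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = foofs_zero_iter(&iter);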
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index a845c012b50c..04d7919636c1 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
-static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
+static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
@@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return length;
+ return iomap_iter_advance(iter, &length);
}
}
@@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_hole_iter(&iter, &pos);
+ iter.status = iomap_seek_hole_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found hole before EOF */
@@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
-static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
+static int iomap_seek_data_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return length;
+ return iomap_iter_advance(iter, &length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
default:
*hole_pos = iter->pos;
@@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_data_iter(&iter, &pos);
+ iter.status = iomap_seek_data_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found data before EOF */
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b90d0eda9e51..c1a762c10ce4 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
-static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
+static int iomap_swapfile_iter(struct iomap_iter *iter,
struct iomap *iomap, struct iomap_swapfile_info *isi)
{
switch (iomap->type) {
@@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
- return iomap_length(iter);
+
+ return iomap_iter_advance_full(iter);
}
/*
@@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
+ iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
if (ret < 0)
return ret;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 4118a42cdab0..9eab2c8ac3c5 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
- __field(s64, processed)
+ __field(int, status)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
- __entry->processed = iter->processed;
+ __entry->status = iter->status;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
- __entry->processed,
+ __entry->status,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index e8e80761ac73..1c7c49356878 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under j_list_lock. The caller provided us with a ref against the
+ * buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
@@ -738,10 +738,8 @@ start_journal_io:
err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) {
printk(KERN_WARNING
- "JBD2: Detected IO errors while flushing file data "
- "on %s\n", journal->j_devname);
- if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
- jbd2_journal_abort(journal, err);
+ "JBD2: Detected IO errors %d while flushing file data on %s\n",
+ err, journal->j_devname);
err = 0;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d8084b31b361..a5ccba25ff47 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
int ret = 0;
- transaction_t *commit_trans;
+ transaction_t *commit_trans, *running_trans;
if (!(journal->j_flags & JBD2_BARRIER))
return 0;
@@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
goto out;
commit_trans = journal->j_committing_transaction;
if (!commit_trans || commit_trans->t_tid != tid) {
+ running_trans = journal->j_running_transaction;
+ /*
+ * The query transaction hasn't started committing,
+ * it must still be running.
+ */
+ if (WARN_ON_ONCE(!running_trans ||
+ running_trans->t_tid != tid))
+ goto out;
+
+ running_trans->t_need_data_flush = 1;
ret = 1;
goto out;
}
@@ -947,7 +957,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* descriptor blocks we do need to generate bona fide buffers.
*
* After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
- * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * the buffer's contents they really should run flush_dcache_folio(bh->b_folio).
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
@@ -1361,7 +1371,7 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
- if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ if (jbd2_journal_has_csum_v2or3(journal) &&
jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
@@ -1369,7 +1379,7 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
- if (jbd2_journal_has_csum_v2or3_feature(journal)) {
+ if (jbd2_journal_has_csum_v2or3(journal)) {
if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
printk(KERN_ERR "JBD2: Unknown checksum type\n");
return err;
@@ -1869,7 +1879,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
- WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
@@ -1965,17 +1974,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
return err;
}
- if (block_start == ~0ULL) {
- block_start = phys_block;
- block_stop = block_start - 1;
- }
+ if (block_start == ~0ULL)
+ block_stop = block_start = phys_block;
/*
* last block not contiguous with current block,
* process last contiguous region and return to this block on
* next loop
*/
- if (phys_block != block_stop + 1) {
+ if (phys_block != block_stop) {
block--;
} else {
block_stop++;
@@ -1994,11 +2001,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
*/
byte_start = block_start * journal->j_blocksize;
byte_stop = block_stop * journal->j_blocksize;
- byte_count = (block_stop - block_start + 1) *
- journal->j_blocksize;
+ byte_count = (block_stop - block_start) * journal->j_blocksize;
truncate_inode_pages_range(journal->j_dev->bd_mapping,
- byte_start, byte_stop);
+ byte_start, byte_stop - 1);
if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
err = blkdev_issue_discard(journal->j_dev,
@@ -2013,7 +2019,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
}
if (unlikely(err != 0)) {
- pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
+ pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)",
err, block_start, block_stop);
return err;
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9192be7c19d8..c271a050b7e6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -39,7 +39,7 @@ struct recovery_info
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
+static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
@@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n)
*/
#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static void do_readahead(journal_t *journal, unsigned int start)
{
- int err;
unsigned int max, nbufs, next;
unsigned long long blocknr;
struct buffer_head *bh;
@@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
nbufs = 0;
for (next = start; next < max; next++) {
- err = jbd2_journal_bmap(journal, next, &blocknr);
+ int err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
@@ -94,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- err = -ENOMEM;
+ if (!bh)
goto failed;
- }
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
@@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (nbufs)
bh_readahead_batch(nbufs, bufs, 0);
- err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
- return err;
}
#endif /* __KERNEL__ */
@@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal,
int jbd2_journal_recover(journal_t *journal)
{
int err, err2;
- journal_superblock_t * sb;
-
struct recovery_info info;
memset(&info, 0, sizeof(info));
- sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
- * unmounted.
+ * unmounted. We use its in-memory version j_tail here because
+	 * jbd2_journal_wipe() could have updated it without updating the journal
+ * superblock.
*/
- if (!sb->s_start) {
+ if (!journal->j_tail) {
+ journal_superblock_t *sb = journal->j_superblock;
+
jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n",
be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
@@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal)
journal->j_transaction_sequence, journal->j_head);
jbd2_journal_clear_revoke(journal);
+ /* Free revoke table allocated for replay */
+ if (journal->j_revoke != journal->j_revoke_table[0] &&
+ journal->j_revoke != journal->j_revoke_table[1]) {
+ jbd2_journal_destroy_revoke_table(journal->j_revoke);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
@@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal,
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
+ else if (pass == PASS_REVOKE) {
+ /*
+ * Would the default revoke table have too long hash chains
+ * during replay?
+ */
+ if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
+ unsigned int hash_size;
+
+ /*
+ * Aim for average chain length of 8, limit at 1M
+ * entries to avoid problems with malicious
+ * filesystems.
+ */
+ hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
+ 1U << 20);
+ journal->j_revoke =
+ jbd2_journal_init_revoke_table(hash_size);
+ if (!journal->j_revoke) {
+ printk(KERN_ERR
+ "JBD2: failed to allocate revoke table for replay with %u entries. "
+ "Journal replay may be slow.\n", hash_size);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
+ }
+ }
jbd2_debug(1, "Starting recovery pass %d\n", pass);
@@ -852,6 +879,13 @@ chksum_ok:
case JBD2_REVOKE_BLOCK:
/*
+ * If we aren't in the SCAN or REVOKE pass, then we can
+ * just skip over this block.
+ */
+ if (pass != PASS_REVOKE && pass != PASS_SCAN)
+ continue;
+
+ /*
* Check revoke block crc in pass_scan, if csum verify
* failed, check commit block time later.
*/
@@ -863,12 +897,7 @@ chksum_ok:
need_check_commit_time = true;
}
- /* If we aren't in the REVOKE pass, then we can
- * just skip over this block. */
- if (pass != PASS_REVOKE)
- continue;
-
- err = scan_revoke_records(journal, bh,
+ err = scan_revoke_records(journal, pass, bh,
next_commit_ID, info);
if (err)
goto failed;
@@ -922,8 +951,9 @@ chksum_ok:
/* Scan a revoke record, marking all blocks mentioned as revoked. */
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
- tid_t sequence, struct recovery_info *info)
+static int scan_revoke_records(journal_t *journal, enum passtype pass,
+ struct buffer_head *bh, tid_t sequence,
+ struct recovery_info *info)
{
jbd2_journal_revoke_header_t *header;
int offset, max;
@@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
if (jbd2_has_feature_64bit(journal))
record_len = 8;
+ if (pass == PASS_SCAN) {
+ info->nr_revokes += (max - offset) / record_len;
+ return 0;
+ }
+
while (offset + record_len <= max) {
unsigned long long blocknr;
int err;
@@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
- ++info->nr_revokes;
}
return 0;
}
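To make the revoke-table sizing heuristic above concrete, here is a quick back-of-the-envelope check as plain userspace C (the values and the local roundup_pow_of_two helper are illustrative, only the formula comes from the patch):

	#include <stdio.h>

	static unsigned int roundup_pow_of_two(unsigned int n)
	{
		unsigned int p = 1;

		while (p < n)
			p <<= 1;
		return p;
	}

	int main(void)
	{
		unsigned int nr_revokes = 200000;	/* example record count */
		unsigned int hash_size;

		/* Aim for ~8 entries per chain, capped at 1M buckets. */
		hash_size = roundup_pow_of_two(nr_revokes / 8);
		if (hash_size > (1U << 20))
			hash_size = 1U << 20;

		printf("%u revokes -> %u buckets (~%.1f per chain)\n",
		       nr_revokes, hash_size, (double)nr_revokes / hash_size);
		/* prints: 200000 revokes -> 32768 buckets (~6.1 per chain) */
		return 0;
	}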
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index ce63d5fde9c3..0cf0fddbee81 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void)
return 0;
}
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
+struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
int shift = 0;
int tmp = hash_size;
@@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
table->hash_size = hash_size;
table->hash_shift = shift;
table->hash_table =
- kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
+ kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(jbd2_revoke_table_cache, table);
table = NULL;
@@ -245,7 +245,7 @@ out:
return table;
}
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
+void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
@@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
J_ASSERT(list_empty(hash_list));
}
- kfree(table->hash_table);
+ kvfree(table->hash_table);
kmem_cache_free(jbd2_revoke_table_cache, table);
}
@@ -420,12 +420,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
-int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
- int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
@@ -450,7 +449,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
- did_revoke = 1;
}
}
@@ -473,11 +471,10 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
__brelse(bh2);
}
}
- return did_revoke;
}
/*
- * journal_clear_revoked_flag clears revoked flag of buffers in
+ * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffers in the next
* transaction which is going to be started.
*/
@@ -506,9 +503,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
}
}
-/* journal_switch_revoke table select j_revoke for next transaction
- * we do not want to suspend any processing until all revokes are
- * written -bzzz
+/* jbd2_journal_switch_revoke_table selects j_revoke for the next
+ * transaction; we do not want to suspend any processing until all
+ * revokes are written -bzzz
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 66513c18ca29..cbc4785462f5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal,
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
- INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
@@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal,
*/
/*
- * Update transaction's maximum wait time, if debugging is enabled.
- *
* t_max_wait is carefully updated here with use of atomic compare exchange.
 * Note that there could be multiple threads trying to do this simultaneously
* hence using cmpxchg to avoid any use of locks in this case.
- * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
@@ -2079,21 +2075,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
jh->b_transaction = NULL;
}
-void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- spin_lock(&jh->b_state_lock);
- spin_lock(&journal->j_list_lock);
- __jbd2_journal_unfile_buffer(jh);
- spin_unlock(&journal->j_list_lock);
- spin_unlock(&jh->b_state_lock);
- jbd2_journal_put_journal_head(jh);
- __brelse(bh);
-}
-
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
@@ -2192,7 +2173,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
/*
* We don't want to write the buffer anymore, clear the
* bit so that we don't confuse checks in
- * __journal_file_buffer
+ * __jbd2_journal_file_buffer
*/
clear_buffer_dirty(bh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 2b2938970da3..dd91f725ded6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
static int jffs2_unlink (struct inode *,struct dentry *);
static int jffs2_symlink (struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
-static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
- umode_t);
+static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
+ umode_t);
static int jffs2_rmdir (struct inode *,struct dentry *);
static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *,
umode_t,dev_t);
@@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
}
-static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
+ struct dentry *dentry, umode_t mode)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
@@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
ri = jffs2_alloc_raw_inode();
if (!ri)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
c = JFFS2_SB_INFO(dir_i->i_sb);
@@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (ret) {
jffs2_free_raw_inode(ri);
- return ret;
+ return ERR_PTR(ret);
}
inode = jffs2_new_inode(dir_i, mode, ri);
@@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (IS_ERR(inode)) {
jffs2_free_raw_inode(ri);
jffs2_complete_reservation(c);
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
}
inode->i_op = &jffs2_dir_inode_operations;
@@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
jffs2_complete_reservation(c);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
fail:
iget_failed(inode);
- return ret;
+ return ERR_PTR(ret);
}
static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 07cfdc440596..60fc92dee24d 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -369,7 +369,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
ASSERT(length >= 0);
- if (test_cflag(COMMIT_Nolink, ip)) {
+ if (test_cflag(COMMIT_Nolink, ip) || isReadOnly(ip)) {
xtTruncate(0, ip, length, COMMIT_WMAP);
return;
}
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index f9009e4f9ffd..26e89d0c69b6 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,41 +178,26 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
-
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
- if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
- bmp->db_l2nbperpage < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag || bmp->db_numag > MAXAG) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
- if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
- bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
- if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
- bmp->db_agl2size < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
- if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) ||
+ (bmp->db_l2nbperpage < 0) ||
+ !bmp->db_numag || (bmp->db_numag > MAXAG) ||
+ (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) ||
+ (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) ||
+ !bmp->db_agwidth ||
+ (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) ||
+ (bmp->db_agl2size < 0) ||
+ ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -3403,7 +3388,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
oldl2agsize = bmp->db_agl2size;
bmp->db_agl2size = l2agsize;
- bmp->db_agsize = 1 << l2agsize;
+ bmp->db_agsize = (s64)1 << l2agsize;
/* compute new number of AG */
agno = bmp->db_numag;
@@ -3666,8 +3651,8 @@ void dbFinalizeBmap(struct inode *ipbmap)
* system size is not a multiple of the group size).
*/
inactfree = (inactags && ag_rem) ?
- ((inactags - 1) << bmp->db_agl2size) + ag_rem
- : inactags << bmp->db_agl2size;
+ (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem
+ : ((s64)inactags << bmp->db_agl2size);
/* determine how many free blocks are in the active
* allocation groups plus the average number of free blocks
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 8f85177f284b..93db6eec4465 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -117,7 +117,8 @@ do { \
if (!(RC)) { \
if (((P)->header.nextindex > \
(((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \
+ ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \
BT_PUTPAGE(MP); \
jfs_error((IP)->i_sb, \
"DT_GETPAGE: dtree page corrupt\n"); \
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 63d21822d309..46529bcc8297 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -74,6 +74,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
int rc;
int xflag;
+ if (isReadOnly(ip)) {
+ jfs_error(ip->i_sb, "read-only filesystem\n");
+ return -EIO;
+ }
+
/* This blocks if we are low on resources */
txBeginAnon(ip->i_sb);
@@ -253,6 +258,11 @@ int extRecord(struct inode *ip, xad_t * xp)
{
int rc;
+ if (isReadOnly(ip)) {
+ jfs_error(ip->i_sb, "read-only filesystem\n");
+ return -EIO;
+ }
+
txBeginAnon(ip->i_sb);
mutex_lock(&JFS_IP(ip)->commit_mutex);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index a360b24ed320..ecb8e05b8b84 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -102,7 +102,7 @@ int diMount(struct inode *ipimap)
* allocate/initialize the in-memory inode map control structure
*/
/* allocate the in-memory inode map control structure. */
- imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ imap = kzalloc(sizeof(struct inomap), GFP_KERNEL);
if (imap == NULL)
return -ENOMEM;
@@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
dp += inum % 8; /* 8 inodes per 4K page */
/* copy on-disk inode to in-memory inode */
- if ((copy_from_dinode(dp, ip)) != 0) {
+ if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) {
/* handle bad return by returning NULL for ip */
set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
@@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
*
* RETURN VALUES:
* 0 - success
- * -ENOMEM - insufficient memory
+ * -EINVAL - unexpected inode type
*/
static int copy_from_dinode(struct dinode * dip, struct inode *ip)
{
struct jfs_inode_info *jfs_ip = JFS_IP(ip);
struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int fileset = le32_to_cpu(dip->di_fileset);
+
+ switch (fileset) {
+ case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I:
+ case LOG_I: case BADBLOCK_I: case FILESYSTEM_I:
+ break;
+ default:
+ return -EINVAL;
+ }
- jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
+ jfs_ip->fileset = fileset;
jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
jfs_set_inode_flags(ip);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index fc8ede43afde..65a218eba8fa 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
* dentry - dentry of child directory
* mode - create mode (rwxrwxrwx).
*
- * RETURN: Errors from subroutines
+ * RETURN: ERR_PTR() of errors from subroutines.
*
* note:
* EACCES: user needs search+write permission on the parent directory
*/
-static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
{
int rc = 0;
tid_t tid; /* transaction id */
@@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
out1:
jfs_info("jfs_mkdir: rc:%d", rc);
- return rc;
+ return ERR_PTR(rc);
}
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 223d9ac59839..10368c188c5e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -389,8 +389,8 @@ static int jfs_reconfigure(struct fs_context *fc)
if (!ctx->newLVSize) {
ctx->newLVSize = sb_bdev_nr_blocks(sb);
- if (ctx->newLVSize == 0)
- pr_err("JFS: Cannot determine volume size\n");
+ if (ctx->newLVSize == 0)
+ pr_err("JFS: Cannot determine volume size\n");
}
rc = jfs_extendfs(sb, ctx->newLVSize, 0);
@@ -766,7 +766,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24afbae87225..11d7f74d207b 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
- int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
-
- printk(KERN_ERR "ea_get: invalid extended attribute\n");
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
- ea_buf->xattr, size, 1);
+ if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) {
+ printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n",
+ EALIST_SIZE(ea_buf->xattr));
+ } else {
+ int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
+
+ printk(KERN_ERR "ea_get: invalid extended attribute\n");
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
+ ea_buf->xattr, size, 1);
+ }
ea_release(inode, ea_buf);
rc = -EIO;
goto clean_up;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5f0f8b95f44c..d296aad70800 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1230,24 +1230,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
return d_splice_alias(inode, dentry);
}
-static int kernfs_iop_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
int ret;
if (!scops || !scops->mkdir)
- return -EPERM;
+ return ERR_PTR(-EPERM);
if (!kernfs_get_active(parent))
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
ret = scops->mkdir(parent, dentry->d_name.name, mode);
kernfs_put_active(parent);
- return ret;
+ return ERR_PTR(ret);
}
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/libfs.c b/fs/libfs.c
index dc042a975a56..6393d7c49ee6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
}
EXPORT_SYMBOL(simple_inode_init_ts);
-static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
+struct dentry *stashed_dentry_get(struct dentry **stashed)
{
struct dentry *dentry;
@@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = get_stashed_dentry(stashed);
+ path->dentry = stashed_dentry_get(stashed);
if (path->dentry) {
sops->put_data(data);
goto out_path;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 5d9c1406fe27..8938536d8d3c 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
return add_nondir(dentry, inode);
}
-static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode * inode;
int err;
inode = minix_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode_inc_link_count(dir);
minix_set_inode(inode, 0);
@@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
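The jffs2, jfs, kernfs and minix conversions above all follow the same new ->mkdir contract: return NULL when the dentry passed in was instantiated, ERR_PTR() on failure, and (where a filesystem supplies its own dentry) a different dentry pointer. A minimal sketch of an implementation under the new signature, with foofs_new_dir_inode purely hypothetical:

	static struct dentry *foofs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
					  struct dentry *dentry, umode_t mode)
	{
		struct inode *inode = foofs_new_dir_inode(dir, mode);	/* hypothetical */

		if (IS_ERR(inode))
			return ERR_CAST(inode);

		inode_inc_link_count(dir);
		d_instantiate(dentry, inode);
		return NULL;		/* NULL: the passed-in dentry was used */
	}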
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 7b1df8cc2821..a37991fdb194 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -6,6 +6,7 @@
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/seq_file.h>
#include "internal.h"
@@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
+
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map)
+{
+ struct uid_gid_map *map, *map_up;
+ u32 idx, nr_mappings;
+
+ if (!is_valid_mnt_idmap(idmap))
+ return 0;
+
+ /*
+ * Idmappings are shown relative to the caller's idmapping.
+ * This is both the most intuitive and most useful solution.
+ */
+ if (uid_map) {
+ map = &idmap->uid_map;
+ map_up = &current_user_ns()->uid_map;
+ } else {
+ map = &idmap->gid_map;
+ map_up = &current_user_ns()->gid_map;
+ }
+
+ for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) {
+ uid_t lower;
+ struct uid_gid_extent *extent;
+
+ if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = &map->extent[idx];
+ else
+ extent = &map->forward[idx];
+
+ /*
+ * Verify that the whole range of the mapping can be
+ * resolved in the caller's idmapping. If it cannot be
+ * resolved skip the mapping.
+ */
+ lower = map_id_range_up(map_up, extent->lower_first, extent->count);
+ if (lower == (uid_t) -1)
+ continue;
+
+ seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);
+
+ seq->count++; /* mappings are separated by \0 */
+ if (seq_has_overflowed(seq))
+ return -EAGAIN;
+
+ nr_mappings++;
+ }
+
+ return nr_mappings;
+}
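The records emitted above are "<first> <lower> <count>" strings separated by NUL bytes. A consumer that obtained such a buffer (for example through a statmount-style interface; the retrieval path is assumed here, only the record format comes from this code) could walk it like this:

	#include <stdio.h>
	#include <string.h>

	static void print_idmap(const char *buf, size_t len, unsigned int nr_mappings)
	{
		const char *p = buf;

		while (nr_mappings-- && p < buf + len) {
			unsigned int first, lower, count;

			if (sscanf(p, "%u %u %u", &first, &lower, &count) == 3)
				printf("map %u -> %u (len %u)\n", first, lower, count);
			p += strlen(p) + 1;	/* records are NUL-separated */
		}
	}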
diff --git a/fs/mount.h b/fs/mount.h
index ffb613cdfeee..7aecf2a60472 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,12 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+extern struct list_head notify_list;
+
+typedef __u32 __bitwise mntns_flags_t;
+
+#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
+
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
@@ -20,12 +26,18 @@ struct mnt_namespace {
wait_queue_head_t poll;
struct rcu_head mnt_ns_rcu;
};
+ u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
+#ifdef CONFIG_FSNOTIFY
+ __u32 n_fsnotify_mask;
+ struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
+#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
+ mntns_flags_t mntns_flags;
} __randomize_layout;
struct mnt_pcp {
@@ -76,6 +88,8 @@ struct mount {
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
+ struct list_head to_notify; /* need to queue notification */
+ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */
#endif
int mnt_id; /* mount identifier, reused */
u64 mnt_id_unique; /* mount ID unique until reboot */
@@ -156,6 +170,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt)
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
+static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
+{
+ return RB_EMPTY_ROOT(&ns->mounts);
+}
+
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
struct mnt_namespace *ns = mnt->mnt_ns;
@@ -177,3 +196,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
+
+#ifdef CONFIG_FSNOTIFY
+static inline void mnt_notify_add(struct mount *m)
+{
+ /* Optimize the case where there are no watches */
+ if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) ||
+ (m->prev_ns && m->prev_ns->n_fsnotify_marks))
+ list_add_tail(&m->to_notify, &notify_list);
+ else
+ m->prev_ns = m->mnt_ns;
+}
+#else
+static inline void mnt_notify_add(struct mount *m)
+{
+}
+#endif
+
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 82aecf372743..ad7844de87c3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
* don't make any buffers if there is only one buffer on
* the folio and the folio just needs to be set up to date
*/
- if (inode->i_blkbits == PAGE_SHIFT &&
+ if (inode->i_blkbits == folio_shift(folio) &&
buffer_uptodate(bh)) {
folio_mark_uptodate(folio);
return;
@@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct folio *folio = args->folio;
struct inode *inode = folio->mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
const unsigned blocksize = 1 << blkbits;
struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
@@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
sector_t last_block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_hole = blocks_per_page;
+ unsigned first_hole = blocks_per_folio;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
@@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
unsigned relative_block;
gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- /* MAX_BUF_PER_PAGE, for example */
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (args->is_readahead) {
opf |= REQ_RAHEAD;
gfp |= __GFP_NORETRY | __GFP_NOWARN;
@@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + args->nr_pages * blocks_per_page;
+ block_in_file = folio_pos(folio) >> blkbits;
+ last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
clear_buffer_mapped(map_bh);
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* Then do more get_blocks calls until we are done with this folio.
*/
map_bh->b_folio = folio;
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
@@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
goto confused;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
bdev = map_bh->b_bdev;
}
- if (first_hole != blocks_per_page) {
- folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
+ if (first_hole != blocks_per_folio) {
+ folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
if (first_hole == 0) {
folio_mark_uptodate(folio);
folio_unlock(folio);
@@ -303,10 +300,10 @@ alloc_new:
relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
- (first_hole != blocks_per_page))
+ (first_hole != blocks_per_folio))
args->bio = mpage_bio_submit_read(args->bio);
else
- args->last_block_in_bio = first_block + blocks_per_page - 1;
+ args->last_block_in_bio = first_block + blocks_per_folio - 1;
out:
return args->bio;
@@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
.folio = folio,
- .nr_pages = 1,
+ .nr_pages = folio_nr_pages(folio),
.get_block = get_block,
};
@@ -456,12 +453,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_unmapped = blocks_per_page;
+ unsigned first_unmapped = blocks_per_folio;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
@@ -486,12 +483,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
*/
if (buffer_dirty(bh))
goto confused;
- if (first_unmapped == blocks_per_page)
+ if (first_unmapped == blocks_per_folio)
first_unmapped = page_block;
continue;
}
- if (first_unmapped != blocks_per_page)
+ if (first_unmapped != blocks_per_folio)
goto confused; /* hole -> non-hole */
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
@@ -527,7 +524,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
* The page has no buffers: map it to disk
*/
BUG_ON(!folio_test_uptodate(folio));
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+ block_in_file = folio_pos(folio) >> blkbits;
/*
* Whole page beyond EOF? Skip allocating blocks to avoid leaking
* space.
@@ -536,7 +533,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
goto page_is_mapped;
last_block = (i_size - 1) >> blkbits;
map_bh.b_folio = folio;
- for (page_block = 0; page_block < blocks_per_page; ) {
+ for (page_block = 0; page_block < blocks_per_folio; ) {
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
@@ -618,14 +615,14 @@ alloc_new:
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
folio_unlock(folio);
- if (boundary || (first_unmapped != blocks_per_page)) {
+ if (boundary || (first_unmapped != blocks_per_folio)) {
bio = mpage_bio_submit_write(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
- mpd->last_block_in_bio = first_block + blocks_per_page - 1;
+ mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
}
goto out;
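The net effect of the mpage changes is that the per-page constants become per-folio: the starting block comes from folio_pos() and the block budget from folio_size(), so a large folio simply maps more blocks per iteration. A rough worked example (illustrative only, not kernel code), assuming a 4 KiB block size:

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 12;				/* 4 KiB blocks              */
	unsigned long folio_size = 64 * 1024UL;		/* one order-4 folio         */
	unsigned long folio_pos = 5 * folio_size;	/* sixth folio in the file   */

	/* previously: PAGE_SIZE >> blkbits == 1 block per 4 KiB page */
	unsigned blocks_per_folio = folio_size >> blkbits;	/* 16 */
	unsigned long block_in_file = folio_pos >> blkbits;	/* 80 */

	printf("%u blocks per folio, first block %lu\n",
	       blocks_per_folio, block_in_file);
	return 0;
}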
diff --git a/fs/namei.c b/fs/namei.c
index ecb7b95c2ca3..360a86ca1f02 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,6 +125,13 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
+static inline void initname(struct filename *name)
+{
+ name->uptr = NULL;
+ name->aname = NULL;
+ atomic_set(&name->refcnt, 1);
+}
+
struct filename *
getname_flags(const char __user *filename, int flags)
{
@@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
-
- atomic_set(&result->refcnt, 1);
- result->uptr = filename;
- result->aname = NULL;
+ initname(result);
audit_getname(result);
return result;
}
@@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags)
return getname_flags(filename, flags);
}
-struct filename *getname(const char __user * filename)
-{
- return getname_flags(filename, 0);
-}
-
struct filename *__getname_maybe_null(const char __user *pathname)
{
struct filename *name;
@@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- result->uptr = NULL;
- result->aname = NULL;
- atomic_set(&result->refcnt, 1);
+ initname(result);
audit_getname(result);
-
return result;
}
EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
+ int refcnt;
+
if (IS_ERR_OR_NULL(name))
return;
- if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
- return;
+ refcnt = atomic_read(&name->refcnt);
+ if (refcnt != 1) {
+ if (WARN_ON_ONCE(!refcnt))
+ return;
- if (!atomic_dec_and_test(&name->refcnt))
- return;
+ if (!atomic_dec_and_test(&name->refcnt))
+ return;
+ }
if (name->name != name->iname) {
__putname(name->name);
@@ -1670,6 +1671,8 @@ static struct dentry *lookup_dcache(const struct qstr *name,
* dentries - as the matter of fact, this only gets called
* when directory is guaranteed to have no in-lookup children
* at all.
+ * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
+ * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
*/
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
struct dentry *base,
@@ -1680,7 +1683,7 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
struct inode *dir = base->d_inode;
if (dentry)
- return dentry;
+ goto found;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir)))
@@ -1695,6 +1698,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
dput(dentry);
dentry = old;
}
+found:
+ if (IS_ERR(dentry))
+ return dentry;
+ if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
+ dput(dentry);
+ return ERR_PTR(-ENOENT);
+ }
+ if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
+ dput(dentry);
+ return ERR_PTR(-EEXIST);
+ }
return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
@@ -2863,15 +2877,14 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
- * The caller must hold base->i_mutex.
+ * No locks need be held - only a counted reference to @base is needed.
+ *
*/
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
int err;
- WARN_ON_ONCE(!inode_is_locked(base->d_inode));
-
err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -3415,6 +3428,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
if ((acc_mode & MAY_EXEC) && path_noexec(path))
return -EACCES;
break;
+ default:
+ VFS_BUG_ON_INODE(1, inode);
}
error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
@@ -3995,7 +4010,7 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ fput_close(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4078,27 +4093,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* '/', and a directory wasn't requested.
*/
if (last.name[last.len] && !want_dir)
- create_flags = 0;
+ create_flags &= ~LOOKUP_CREATE;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = lookup_one_qstr_excl(&last, path->dentry,
reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
- error = -EEXIST;
- if (d_is_positive(dentry))
- goto fail;
-
- /*
- * Special case - lookup gave negative, but... we had foo/bar/
- * From the vfs_mknod() POV we just have a negative dentry -
- * all is fine. Let's be bastards - you had / on the end, you've
- * been asking for (non-existent) directory. -ENOENT for you.
- */
- if (unlikely(!create_flags)) {
- error = -ENOENT;
- goto fail;
- }
if (unlikely(err2)) {
error = err2;
goto fail;
@@ -4129,7 +4130,8 @@ EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
- dput(dentry);
+ if (!IS_ERR(dentry))
+ dput(dentry);
inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
@@ -4275,7 +4277,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
}
/**
- * vfs_mkdir - create directory
+ * vfs_mkdir - create directory returning correct dentry if possible
* @idmap: idmap of the mount the inode was found from
* @dir: inode of the parent directory
* @dentry: dentry of the child directory
@@ -4288,32 +4290,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
+ *
+ * In the event that the filesystem does not use @dentry but leaves it
+ * negative, or unhashes it and possibly splices in a different one that it
+ * returns, the original dentry is dput() and the alternate is returned.
+ *
+ * In case of an error the dentry is dput() and an ERR_PTR() is returned.
*/
-int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
unsigned max_links = dir->i_sb->s_max_links;
+ struct dentry *de;
error = may_create(idmap, dir, dentry);
if (error)
- return error;
+ goto err;
+ error = -EPERM;
if (!dir->i_op->mkdir)
- return -EPERM;
+ goto err;
mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
- return error;
+ goto err;
+ error = -EMLINK;
if (max_links && dir->i_nlink >= max_links)
- return -EMLINK;
+ goto err;
- error = dir->i_op->mkdir(idmap, dir, dentry, mode);
- if (!error)
- fsnotify_mkdir(dir, dentry);
- return error;
+ de = dir->i_op->mkdir(idmap, dir, dentry, mode);
+ error = PTR_ERR(de);
+ if (IS_ERR(de))
+ goto err;
+ if (de) {
+ dput(dentry);
+ dentry = de;
+ }
+ fsnotify_mkdir(dir, dentry);
+ return dentry;
+
+err:
+ dput(dentry);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_mkdir);
@@ -4333,8 +4354,10 @@ retry:
error = security_path_mkdir(&path, dentry,
mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
}
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
@@ -4445,10 +4468,6 @@ retry:
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
- if (!dentry->d_inode) {
- error = -ENOENT;
- goto exit4;
- }
error = security_path_rmdir(&path, dentry);
if (error)
goto exit4;
@@ -4579,7 +4598,7 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len] || d_is_negative(dentry))
+ if (last.name[last.len])
goto slashes;
inode = dentry->d_inode;
ihold(inode);
@@ -4613,9 +4632,7 @@ exit1:
return error;
slashes:
- if (d_is_negative(dentry))
- error = -ENOENT;
- else if (d_is_dir(dentry))
+ if (d_is_dir(dentry))
error = -EISDIR;
else
error = -ENOTDIR;
@@ -5115,7 +5132,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
- unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
+ unsigned int lookup_flags = 0, target_flags =
+ LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
bool should_retry = false;
int error = -EINVAL;
@@ -5128,6 +5146,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
if (flags & RENAME_EXCHANGE)
target_flags = 0;
+ if (flags & RENAME_NOREPLACE)
+ target_flags |= LOOKUP_EXCL;
retry:
error = filename_parentat(olddfd, from, lookup_flags, &old_path,
@@ -5169,23 +5189,12 @@ retry_deleg:
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
- /* source must exist */
- error = -ENOENT;
- if (d_is_negative(old_dentry))
- goto exit4;
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
- error = -EEXIST;
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
- goto exit5;
if (flags & RENAME_EXCHANGE) {
- error = -ENOENT;
- if (d_is_negative(new_dentry))
- goto exit5;
-
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
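The vfs_mkdir() rework above also changes the ->mkdir() inode operation to return a struct dentry * instead of an int (the NFS hunks below are converted accordingly). A hedged sketch of the new convention for a hypothetical filesystem — the foofs_* names and the inode helper are made up; only the return-value handling mirrors the vfs_mkdir() code above:

static struct dentry *foofs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				  struct dentry *dentry, umode_t mode)
{
	struct inode *inode;

	/* foofs_new_inode() is a made-up stand-in for fs-specific allocation */
	inode = foofs_new_inode(dir, mode | S_IFDIR);
	if (IS_ERR(inode))
		return ERR_CAST(inode);	/* vfs_mkdir() dput()s @dentry on error */

	/*
	 * Returning NULL means "the dentry passed in was used as-is".
	 * Returning a different dentry (e.g. the result of
	 * d_splice_alias()) makes vfs_mkdir() dput() the original and
	 * hand the alternate back to the caller.
	 */
	d_instantiate(dentry, inode);
	return NULL;
}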
diff --git a/fs/namespace.c b/fs/namespace.c
index 8f1000f9f3df..6100e5b962a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -81,15 +81,23 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+#ifdef CONFIG_FSNOTIFY
+LIST_HEAD(notify_list); /* protected by namespace_sem */
+#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+enum mount_kattr_flags_t {
+ MOUNT_KATTR_RECURSE = (1 << 0),
+ MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
+};
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
- bool recurse;
+ enum mount_kattr_flags_t kflags;
struct user_namespace *mnt_userns;
struct mnt_idmap *mnt_idmap;
};
@@ -163,6 +171,7 @@ static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
+ fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
}
@@ -998,6 +1007,17 @@ static inline int check_mnt(struct mount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+static inline bool check_anonymous_mnt(struct mount *mnt)
+{
+ u64 seq;
+
+ if (!is_anon_ns(mnt->mnt_ns))
+ return false;
+
+ seq = mnt->mnt_ns->seq_origin;
+ return !seq || (seq == current->nsproxy->mnt_ns->seq);
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1176,6 +1196,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
+
+ mnt_notify_add(mnt);
}
/*
@@ -1723,6 +1745,50 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
+#ifdef CONFIG_FSNOTIFY
+static void mnt_notify(struct mount *p)
+{
+ if (!p->prev_ns && p->mnt_ns) {
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ } else if (p->prev_ns && !p->mnt_ns) {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ } else if (p->prev_ns == p->mnt_ns) {
+ fsnotify_mnt_move(p->mnt_ns, &p->mnt);
+ } else {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ }
+ p->prev_ns = p->mnt_ns;
+}
+
+static void notify_mnt_list(void)
+{
+ struct mount *m, *tmp;
+ /*
+ * Notify about mounts that were added, reparented, detached, or that
+ * remain connected after unmount.
+ */
+ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
+ mnt_notify(m);
+ list_del_init(&m->to_notify);
+ }
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return !list_empty(&notify_list);
+}
+#else
+static void notify_mnt_list(void)
+{
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return false;
+}
+#endif
+
static void namespace_unlock(void)
{
struct hlist_head head;
@@ -1733,7 +1799,18 @@ static void namespace_unlock(void)
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
- up_write(&namespace_sem);
+ if (need_notify_mnt_list()) {
+ /*
+ * No point blocking out concurrent readers while notifications
+ * are sent. This will also allow statmount()/listmount() to run
+ * concurrently.
+ */
+ downgrade_write(&namespace_sem);
+ notify_mnt_list();
+ up_read(&namespace_sem);
+ } else {
+ up_write(&namespace_sem);
+ }
shrink_dentry_list(&list);
@@ -1846,6 +1923,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
+
+ /*
+ * At this point p->mnt_ns is NULL, notification will be queued
+ * only if
+ *
+ * - p->prev_ns is non-NULL *and*
+ * - p->prev_ns->n_fsnotify_marks is non-NULL
+ *
+ * This will preclude queuing the mount if this is a cleanup
+ * after a failed copy_tree() or destruction of an anonymous
+ * namespace, etc.
+ */
+ mnt_notify_add(p);
}
}
@@ -2026,6 +2116,7 @@ static void warn_mandlock(void)
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
+ struct super_block *sb = path->dentry->d_sb;
if (!may_mount())
return -EPERM;
@@ -2035,7 +2126,7 @@ static int can_umount(const struct path *path, int flags)
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -2145,16 +2236,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
}
}
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
+{
+ if (!is_mnt_ns_file(dentry))
+ return NULL;
+
+ return to_mnt_ns(get_proc_ns(dentry->d_inode));
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
* mount namespace loop?
*/
- struct mnt_namespace *mnt_ns;
- if (!is_mnt_ns_file(dentry))
+ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
+
+ if (!mnt_ns)
return false;
- mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -2246,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path)
static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
+{
+ /*
+ * This mount belonged to an anonymous mount namespace
+ * but was moved to a non-anonymous mount namespace and
+ * then unmounted.
+ */
+ if (unlikely(!mnt_ns))
+ return false;
+
+ /*
+ * This mount belongs to a non-anonymous mount namespace
+ * and we know that such a mount can never transition to
+ * an anonymous mount namespace again.
+ */
+ if (!is_anon_ns(mnt_ns)) {
+ /*
+ * A detached mount either belongs to an anonymous mount
+ * namespace or a non-anonymous mount namespace. It
+ * should never belong to something purely internal.
+ */
+ VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
+ return false;
+ }
+
+ return true;
+}
+
void dissolve_on_fput(struct vfsmount *mnt)
{
struct mnt_namespace *ns;
- namespace_lock();
- lock_mount_hash();
- ns = real_mount(mnt)->mnt_ns;
- if (ns) {
- if (is_anon_ns(ns))
- umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
- else
- ns = NULL;
+ struct mount *m = real_mount(mnt);
+
+ scoped_guard(rcu) {
+ if (!must_dissolve(READ_ONCE(m->mnt_ns)))
+ return;
}
- unlock_mount_hash();
- namespace_unlock();
- if (ns)
- free_mnt_ns(ns);
+
+ scoped_guard(rwsem_write, &namespace_sem) {
+ ns = m->mnt_ns;
+ if (!must_dissolve(ns))
+ return;
+
+ /*
+ * After must_dissolve() we know that this is a detached
+ * mount in an anonymous mount namespace.
+ *
+ * Now when mnt_has_parent() reports that this mount
+ * tree has a parent, we know that this anonymous mount
+ * tree has been moved to another anonymous mount
+ * namespace.
+ *
+ * So when closing this file we cannot unmount the mount
+ * tree. That will be done when the file referring to
+ * the root of the anonymous mount namespace is closed
+ * (it could already be closed, but it would then sync
+ * on @namespace_sem and wait for us to finish).
+ */
+ if (mnt_has_parent(m))
+ return;
+
+ lock_mount_hash();
+ umount_tree(m, UMOUNT_CONNECTED);
+ unlock_mount_hash();
+ }
+
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+ free_mnt_ns(ns);
}
void drop_collected_mounts(struct vfsmount *mnt)
@@ -2287,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree. Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+ struct mount *p;
+ bool ret = false;
+
+ lock_mount_hash();
+ for (p = subtree; p; p = next_mnt(p, subtree))
+ if (mnt_ns_loop(p->mnt.mnt_root))
+ goto out;
+
+ ret = true;
+out:
+ unlock_mount_hash();
+ return ret;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -2295,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
+ * This assumes caller has called or done the equivalent of may_mount().
+ *
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
@@ -2302,30 +2478,36 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- down_read(&namespace_sem);
+ scoped_guard(rwsem_read, &namespace_sem)
if (IS_MNT_UNBINDABLE(old_mnt))
- goto invalid;
+ return ERR_PTR(-EINVAL);
- if (!check_mnt(old_mnt))
- goto invalid;
+ if (mnt_has_parent(old_mnt)) {
+ if (!check_mnt(old_mnt))
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (!is_mounted(&old_mnt->mnt))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure this isn't something purely kernel internal. */
+ if (!is_anon_ns(old_mnt->mnt_ns))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure we don't create mount namespace loops. */
+ if (!check_for_nsfs_mounts(old_mnt))
+ return ERR_PTR(-EINVAL);
+ }
if (has_locked_children(old_mnt, path->dentry))
- goto invalid;
+ return ERR_PTR(-EINVAL);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
- up_read(&namespace_sem);
-
if (IS_ERR(new_mnt))
- return ERR_CAST(new_mnt);
+ return ERR_PTR(-EINVAL);
/* Longterm mount to be removed by kern_unmount*() */
new_mnt->mnt_ns = MNT_NS_INTERNAL;
-
return &new_mnt->mnt;
-
-invalid:
- up_read(&namespace_sem);
- return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2424,6 +2606,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
enum mnt_tree_flags_t {
MNT_TREE_MOVE = BIT(0),
MNT_TREE_BENEATH = BIT(1),
+ MNT_TREE_PROPAGATION = BIT(2),
};
/**
@@ -2547,6 +2730,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
dest_mp = smp;
unhash_mnt(source_mnt);
attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
+ mnt_notify_add(source_mnt);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
@@ -2773,6 +2957,71 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
+/* may_copy_tree() - check if a mount tree can be copied
+ * @path: path to the mount tree to be copied
+ *
+ * This helper checks if the caller may copy the mount tree starting
+ * from @path->mnt. The caller may copy the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller tries to copy an nsfs mount referring to a mount
+ * namespace, i.e., the caller is trying to copy a mount namespace
+ * entry from nsfs.
+ * (3) The caller tries to copy a pidfs mount referring to a pidfd.
+ * (4) The caller is trying to copy a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * This is not strictly necessary. The current semantics of the new
+ * mount api enforce that the caller must be located in the same
+ * mount namespace as the mount tree it interacts with. Using the
+ * origin sequence number preserves these semantics even for
+ * anonymous mount namespaces. However, one could envision extending
+ * the api to operate directly across mount namespaces if needed.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to copy the mount tree.
+ *
+ * Returns true if the mount tree can be copied, false otherwise.
+ */
+static inline bool may_copy_tree(struct path *path)
+{
+ struct mount *mnt = real_mount(path->mnt);
+ const struct dentry_operations *d_op;
+
+ if (check_mnt(mnt))
+ return true;
+
+ d_op = path->dentry->d_op;
+ if (d_op == &ns_dentry_operations)
+ return true;
+
+ if (d_op == &pidfs_dentry_operations)
+ return true;
+
+ if (!is_mounted(path->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
@@ -2780,13 +3029,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
- if (!check_mnt(old)) {
- const struct dentry_operations *d_op = old_path->dentry->d_op;
-
- if (d_op != &ns_dentry_operations &&
- d_op != &pidfs_dentry_operations)
- return mnt;
- }
+ if (!may_copy_tree(old_path))
+ return mnt;
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;
@@ -2853,15 +3097,30 @@ out:
static struct file *open_detached_copy(struct path *path, bool recursive)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
+ struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
struct file *file;
+ ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
return ERR_CAST(ns);
namespace_lock();
+
+ /*
+ * Record the sequence number of the source mount namespace.
+ * This needs to hold namespace_sem to ensure that the mount
+ * doesn't get attached.
+ */
+ if (is_mounted(path->mnt)) {
+ src_mnt_ns = real_mount(path->mnt)->mnt_ns;
+ if (is_anon_ns(src_mnt_ns))
+ ns->seq_origin = src_mnt_ns->seq_origin;
+ else
+ ns->seq_origin = src_mnt_ns->seq;
+ }
+
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
namespace_unlock();
@@ -2889,24 +3148,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
- struct file *file;
- struct path path;
+ int ret;
+ struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
- int error;
- int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -2916,27 +3173,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
- return -EPERM;
+ return ERR_PTR(-EPERM);
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ if (detached)
+ return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+ return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ int fd;
+ struct file *file __free(fput) = NULL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (unlikely(error)) {
- file = ERR_PTR(error);
- } else {
- if (detached)
- file = open_detached_copy(&path, flags & AT_RECURSIVE);
- else
- file = dentry_open(&path, O_PATH, current_cred());
- path_put(&path);
- }
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- fd_install(fd, file);
+ fd_install(fd, no_free_ptr(file));
return fd;
}
@@ -3123,28 +3385,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-/*
- * Check that there aren't references to earlier/same mount namespaces in the
- * specified subtree. Such references can act as pins for mount namespaces
- * that aren't checked by the mount-cycle checking code, thereby allowing
- * cycles to be made.
- */
-static bool check_for_nsfs_mounts(struct mount *subtree)
-{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
- if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
-}
-
static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
@@ -3320,8 +3560,56 @@ static int can_move_mount_beneath(const struct path *from,
return 0;
}
-static int do_move_mount(struct path *old_path, struct path *new_path,
- bool beneath)
+/* may_use_mount() - check if a mount tree can be used
+ * @mnt: vfsmount to be used
+ *
+ * This helper checks if the caller may use the mount tree starting
+ * from @mnt. The caller may use the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller is trying to use a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to use the mount tree.
+ *
+ * Returns true if the mount tree can be used, false otherwise.
+ */
+static inline bool may_use_mount(struct mount *mnt)
+{
+ if (check_mnt(mnt))
+ return true;
+
+ /*
+ * Make sure that no one unmounted the target path or somehow
+ * managed to get their hands on something purely kernel
+ * internal.
+ */
+ if (!is_mounted(&mnt->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+static int do_move_mount(struct path *old_path,
+ struct path *new_path, enum mnt_tree_flags_t flags)
{
struct mnt_namespace *ns;
struct mount *p;
@@ -3329,8 +3617,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
struct mount *parent;
struct mountpoint *mp, *old_mp;
int err;
- bool attached;
- enum mnt_tree_flags_t flags = 0;
+ bool attached, beneath = flags & MNT_TREE_BENEATH;
mp = do_lock_mount(new_path, beneath);
if (IS_ERR(mp))
@@ -3346,8 +3633,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
ns = old->mnt_ns;
err = -EINVAL;
- /* The mountpoint must be in our namespace. */
- if (!check_mnt(p))
+ if (!may_use_mount(p))
goto out;
/* The thing moved must be mounted... */
@@ -3358,6 +3644,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
+ if (is_anon_ns(ns)) {
+ /*
+ * Ending up with two files referring to the root of the
+ * same anonymous mount namespace would cause an error
+ * as this would mean trying to move the same mount
+ * twice into the mount tree which would be rejected
+ * later. But be explicit about it right here.
+ */
+ if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
+ goto out;
+
+ /*
+ * If this is an anonymous mount tree ensure that mount
+ * propagation can detect mounts that were just
+ * propagated to the target mount tree so we don't
+ * propagate onto them.
+ */
+ ns->mntns_flags |= MNTNS_PROPAGATING;
+ } else if (is_anon_ns(p->mnt_ns)) {
+ /*
+ * Don't allow moving an attached mount tree to an
+ * anonymous mount tree.
+ */
+ goto out;
+ }
+
if (old->mnt.mnt_flags & MNT_LOCKED)
goto out;
@@ -3400,6 +3712,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (err)
goto out;
+ if (is_anon_ns(ns))
+ ns->mntns_flags &= ~MNTNS_PROPAGATING;
+
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old->mnt_expire);
@@ -3408,10 +3723,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
out:
unlock_mount(mp);
if (!err) {
- if (attached)
+ if (attached) {
mntput_no_expire(parent);
- else
+ } else {
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
free_mnt_ns(ns);
+ }
}
return err;
}
@@ -3428,7 +3746,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, false);
+ err = do_move_mount(&old_path, path, 0);
path_put(&old_path);
return err;
}
@@ -4269,6 +4587,21 @@ err_unlock:
return ret;
}
+static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+ enum mnt_tree_flags_t mflags)
+{
+ int ret;
+
+ ret = security_move_mount(from_path, to_path);
+ if (ret)
+ return ret;
+
+ if (mflags & MNT_TREE_PROPAGATION)
+ return do_set_group(from_path, to_path);
+
+ return do_move_mount(from_path, to_path, mflags);
+}
+
/*
* Move a mount from one place to another. In combination with
* fsopen()/fsmount() this is used to install a new mount and in combination
@@ -4282,8 +4615,12 @@ SYSCALL_DEFINE5(move_mount,
int, to_dfd, const char __user *, to_pathname,
unsigned int, flags)
{
- struct path from_path, to_path;
- unsigned int lflags;
+ struct path to_path __free(path_put) = {};
+ struct path from_path __free(path_put) = {};
+ struct filename *to_name __free(putname) = NULL;
+ struct filename *from_name __free(putname) = NULL;
+ unsigned int lflags, uflags;
+ enum mnt_tree_flags_t mflags = 0;
int ret = 0;
if (!may_mount())
@@ -4296,43 +4633,53 @@ SYSCALL_DEFINE5(move_mount,
(MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
return -EINVAL;
- /* If someone gives a pathname, they aren't permitted to move
- * from an fd that requires unmount as we can't get at the flag
- * to clear it afterwards.
- */
+ if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
+ if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
+
lflags = 0;
if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
-
- ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
- if (ret < 0)
- return ret;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
lflags = 0;
if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ to_name = getname_maybe_null(to_pathname, uflags);
+ if (IS_ERR(to_name))
+ return PTR_ERR(to_name);
+
+ if (!to_name && to_dfd >= 0) {
+ CLASS(fd_raw, f_to)(to_dfd);
+ if (fd_empty(f_to))
+ return -EBADF;
+
+ to_path = fd_file(f_to)->f_path;
+ path_get(&to_path);
+ } else {
+ ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
+ if (ret)
+ return ret;
+ }
- ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
- if (ret < 0)
- goto out_from;
+ if (!from_name && from_dfd >= 0) {
+ CLASS(fd_raw, f_from)(from_dfd);
+ if (fd_empty(f_from))
+ return -EBADF;
- ret = security_move_mount(&from_path, &to_path);
- if (ret < 0)
- goto out_to;
+ return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
+ }
- if (flags & MOVE_MOUNT_SET_GROUP)
- ret = do_set_group(&from_path, &to_path);
- else
- ret = do_move_mount(&from_path, &to_path,
- (flags & MOVE_MOUNT_BENEATH));
+ ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
+ if (ret)
+ return ret;
-out_to:
- path_put(&to_path);
-out_from:
- path_put(&from_path);
- return ret;
+ return vfs_move_mount(&from_path, &to_path, mflags);
}
/*
@@ -4468,6 +4815,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
list_del_init(&new_mnt->mnt_expire);
put_mountpoint(root_mp);
unlock_mount_hash();
+ mnt_notify_add(root_mnt);
+ mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
error = 0;
out4:
@@ -4512,11 +4861,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return -EINVAL;
/*
- * Once a mount has been idmapped we don't allow it to change its
- * mapping. It makes things simpler and callers can just create
- * another bind-mount they can idmap if they want to.
+ * We only allow a mount to change its idmapping if it has
+ * never been accessible to userspace.
*/
- if (is_idmapped_mnt(m))
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
@@ -4576,7 +4924,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
break;
}
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
return 0;
}
@@ -4606,18 +4954,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
+ struct mnt_idmap *old_idmap;
+
if (!kattr->mnt_idmap)
return;
- /*
- * Pairs with smp_load_acquire() in mnt_idmap().
- *
- * Since we only allow a mount to change the idmapping once and
- * verified this in can_idmap_mount() we know that the mount has
- * @nop_mnt_idmap attached to it. So there's no need to drop any
- * references.
- */
+ old_idmap = mnt_idmap(&mnt->mnt);
+
+ /* Pairs with smp_load_acquire() in mnt_idmap(). */
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
+ mnt_idmap_put(old_idmap);
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4637,7 +4983,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
break;
}
touch_mnt_namespace(mnt->mnt_ns);
@@ -4667,7 +5013,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
- err = invent_group_ids(mnt, kattr->recurse);
+ err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
if (err) {
namespace_unlock();
return err;
@@ -4718,7 +5064,7 @@ out:
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
struct ns_common *ns;
struct user_namespace *mnt_userns;
@@ -4726,13 +5072,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
- /*
- * We currently do not support clearing an idmapped mount. If this ever
- * is a use-case we can revisit this but for now let's keep it simple
- * and not allow it.
- */
- if (attr->attr_clr & MOUNT_ATTR_IDMAP)
- return -EINVAL;
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
+ /*
+ * We can only remove an idmapping if it's never been
+ * exposed to userspace.
+ */
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
+ return -EINVAL;
+
+ /*
+ * Removal of idmappings is equivalent to setting
+ * nop_mnt_idmap.
+ */
+ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
+ kattr->mnt_idmap = &nop_mnt_idmap;
+ return 0;
+ }
+ }
if (attr->userns_fd > INT_MAX)
return -EINVAL;
@@ -4769,22 +5125,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
- unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
- *kattr = (struct mount_kattr) {
- .lookup_flags = lookup_flags,
- .recurse = !!(flags & AT_RECURSIVE),
- };
-
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
@@ -4832,35 +5174,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
return -EINVAL;
}
- return build_mount_idmapped(attr, usize, kattr, flags);
+ return build_mount_idmapped(attr, usize, kattr);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
- put_user_ns(kattr->mnt_userns);
- kattr->mnt_userns = NULL;
+ if (kattr->mnt_userns) {
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
+ }
if (kattr->mnt_idmap)
mnt_idmap_put(kattr->mnt_idmap);
}
-SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
- unsigned int, flags, struct mount_attr __user *, uattr,
- size_t, usize)
+static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
- int err;
- struct path target;
+ int ret;
struct mount_attr attr;
- struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
- if (flags & ~(AT_EMPTY_PATH |
- AT_RECURSIVE |
- AT_SYMLINK_NOFOLLOW |
- AT_NO_AUTOMOUNT))
- return -EINVAL;
-
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
@@ -4869,9 +5204,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (!may_mount())
return -EPERM;
- err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
- if (err)
- return err;
+ ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+ if (ret)
+ return ret;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
@@ -4879,7 +5214,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
attr.propagation == 0)
return 0;
- err = build_mount_kattr(&attr, usize, &kattr, flags);
+ return build_mount_kattr(&attr, usize, kattr);
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+ unsigned int, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ int err;
+ struct path target;
+ struct mount_kattr kattr;
+ unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+ if (flags & ~(AT_EMPTY_PATH |
+ AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW |
+ AT_NO_AUTOMOUNT))
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ kattr = (struct mount_kattr) {
+ .lookup_flags = lookup_flags,
+ };
+
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ err = copy_mount_setattr(uattr, usize, &kattr);
if (err)
return err;
@@ -4892,6 +5259,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
+ unsigned, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ struct file __free(fput) *file = NULL;
+ int fd;
+
+ if (!uattr && usize)
+ return -EINVAL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (uattr) {
+ int ret;
+ struct mount_kattr kattr = {};
+
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ ret = copy_mount_setattr(uattr, usize, &kattr);
+ if (ret)
+ return ret;
+
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
+
+ finish_mount_kattr(&kattr);
+ }
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ fd_install(fd, no_free_ptr(file));
+ return fd;
+}
+
int show_path(struct seq_file *m, struct dentry *root)
{
if (root->d_sb->s_op->show_path)
@@ -4915,6 +5323,7 @@ struct kstatmount {
struct statmount __user *buf;
size_t bufsize;
struct vfsmount *mnt;
+ struct mnt_idmap *idmap;
u64 mask;
struct path root;
struct statmount sm;
@@ -5184,6 +5593,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, true);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_uidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_UIDMAP;
+ return 0;
+}
+
+static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, false);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_gidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_GIDMAP;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret = 0;
@@ -5231,6 +5680,14 @@ static int statmount_string(struct kstatmount *s, u64 flag)
offp = &sm->sb_source;
ret = statmount_sb_source(s, seq);
break;
+ case STATMOUNT_MNT_UIDMAP:
+ sm->mnt_uidmap = start;
+ ret = statmount_mnt_uidmap(s, seq);
+ break;
+ case STATMOUNT_MNT_GIDMAP:
+ sm->mnt_gidmap = start;
+ ret = statmount_mnt_gidmap(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -5306,7 +5763,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
* We have to find the first mount in our ns and use that, however it
* may not exist, so handle that properly.
*/
- if (RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_empty(ns))
return -ENOENT;
first = child = ns->root;
@@ -5323,6 +5780,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
+ STATMOUNT_MNT_BASIC | \
+ STATMOUNT_PROPAGATE_FROM | \
+ STATMOUNT_MNT_ROOT | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_NS_ID | \
+ STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | \
+ STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | \
+ STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_SUPPORTED_MASK)
+
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5331,7 +5803,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
int err;
/* Has the namespace already been emptied? */
- if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_id && mnt_ns_empty(ns))
return -ENOENT;
s->mnt = lookup_mnt_in_ns(mnt_id, ns);
@@ -5356,6 +5828,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_SB_BASIC)
statmount_sb_basic(s);
@@ -5389,12 +5862,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_SB_SOURCE)
err = statmount_string(s, STATMOUNT_SB_SOURCE);
+ if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
+
+ if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
+ if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
+ s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
+ s->sm.supported_mask = STATMOUNT_SUPPORTED;
+ }
+
if (err)
return err;
+ /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
+ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
+
return 0;
}
@@ -5412,7 +5899,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
- STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
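Taken together, the namespace.c changes let userspace grab a detached tree and idmap it in one step via the new open_tree_attr() syscall, then attach it with move_mount(). A hedged userspace sketch: there is no glibc wrapper for open_tree_attr(), so the raw syscall is used and __NR_open_tree_attr must come from uapi headers carrying this series; <sys/mount.h> from glibc >= 2.36 is assumed for move_mount() and the mount_attr definitions, and the userns_fd below merely reuses the caller's own user namespace as a placeholder (a real caller would pass a dedicated user namespace fd):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>		/* struct mount_attr, OPEN_TREE_CLONE, move_mount() */
#include <sys/syscall.h>	/* __NR_open_tree_attr (assumed present)            */
#include <unistd.h>

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000
#endif

int main(void)
{
	/* Placeholder user namespace fd; see the lead-in above. */
	int userns_fd = open("/proc/self/ns/user", O_RDONLY);
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_IDMAP,
		.userns_fd = (unsigned int)userns_fd,
	};
	int mfd;

	/* Detached, recursive, idmapped copy of /srv in a single call. */
	mfd = syscall(__NR_open_tree_attr, AT_FDCWD, "/srv",
		      OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE,
		      &attr, sizeof(attr));
	if (mfd < 0) {
		perror("open_tree_attr");
		return 1;
	}

	/* Attach it; an empty "from" path means "the fd itself". */
	if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		return 1;
	}
	return 0;
}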
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2b04038b0e40..bc957487f6ec 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1532,7 +1532,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
{
if (NFS_PROTO(dir)->version == 2)
return 0;
- return flags & LOOKUP_EXCL;
+ return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) ==
+ (LOOKUP_CREATE | LOOKUP_EXCL);
}
/*
@@ -2421,11 +2422,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
/*
* See comments for nfs_proc_create regarding failed operations.
*/
-int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct iattr attr;
- int error;
+ struct dentry *ret;
dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
@@ -2434,14 +2435,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = mode | S_IFDIR;
trace_nfs_mkdir_enter(dir, dentry);
- error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
- trace_nfs_mkdir_exit(dir, dentry, error);
- if (error != 0)
- goto out_err;
- return 0;
-out_err:
- d_drop(dentry);
- return error;
+ ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+ trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret));
+ return ret;
}
EXPORT_SYMBOL_GPL(nfs_mkdir);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index fae2c7ae4acc..1ac1d3eec517 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -400,8 +400,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, bool);
-int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
int nfs_rmdir(struct inode *, struct dentry *);
int nfs_unlink(struct inode *, struct dentry *);
int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0c3bc98cd999..755ed3c37051 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -578,13 +578,13 @@ out:
return status;
}
-static int
+static struct dentry *
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- struct dentry *d_alias;
- int status = -ENOMEM;
+ struct dentry *ret = ERR_PTR(-ENOMEM);
+ int status;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
if (data == NULL)
goto out;
- status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
- if (status)
+ ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode,
+ &default_acl, &acl));
+ if (IS_ERR(ret))
goto out;
data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
@@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
data->arg.mkdir.len = dentry->d_name.len;
data->arg.mkdir.sattr = sattr;
- d_alias = nfs3_do_create(dir, dentry, data);
- status = PTR_ERR_OR_ZERO(d_alias);
+ ret = nfs3_do_create(dir, dentry, data);
- if (status != 0)
+ if (IS_ERR(ret))
goto out_release_acls;
- if (d_alias)
- dentry = d_alias;
+ if (ret)
+ dentry = ret;
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+ if (status) {
+ dput(ret);
+ ret = ERR_PTR(status);
+ }
- dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
- dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret));
+ return ret;
}
static int
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6e95db6c17e9..70c8ea943019 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3154,9 +3154,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (d_really_is_negative(dentry)) {
struct dentry *alias;
d_drop(dentry);
- alias = d_exact_alias(dentry, state->inode);
- if (!alias)
- alias = d_splice_alias(igrab(state->inode), dentry);
+ alias = d_splice_alias(igrab(state->inode), dentry);
/* d_splice_alias() can't fail here - it's a non-directory */
if (alias) {
dput(ctx->dentry);
@@ -5139,9 +5137,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- /* Creating a directory bumps nlink in the parent */
- if (data->arg.ftype == NF4DIR)
- nfs4_inc_nlink_locked(dir);
nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
@@ -5151,6 +5146,25 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
return status;
}
+static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
+ struct nfs4_createdata *data)
+{
+ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+ &data->arg.seq_args, &data->res.seq_res, 1);
+
+ if (status)
+ return ERR_PTR(status);
+
+ spin_lock(&dir->i_lock);
+ /* Creating a directory bumps nlink in the parent */
+ nfs4_inc_nlink_locked(dir);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ spin_unlock(&dir->i_lock);
+ return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+}
+
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
nfs4_label_free(data->fattr.label);
@@ -5207,32 +5221,34 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
return err;
}
-static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, struct nfs4_label *label)
+static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr,
+ struct nfs4_label *label)
{
struct nfs4_createdata *data;
- int status = -ENOMEM;
+ struct dentry *ret = ERR_PTR(-ENOMEM);
data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
if (data == NULL)
goto out;
data->arg.label = label;
- status = nfs4_do_create(dir, dentry, data);
+ ret = nfs4_do_mkdir(dir, dentry, data);
nfs4_free_createdata(data);
out:
- return status;
+ return ret;
}
-static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = {
.interruptible = true,
};
struct nfs4_label l, *label;
+ struct dentry *alias;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5240,14 +5256,15 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_mode &= ~current_umask();
do {
- err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ alias = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ err = PTR_ERR_OR_ZERO(alias);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
} while (exception.retry);
nfs4_label_release_security(label);
- return err;
+ return alias;
}
static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77920a2e3cef..63e71310b9f6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -446,13 +446,14 @@ out:
return status;
}
-static int
+static struct dentry *
nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
};
+ struct dentry *alias = NULL;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
- if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ if (status == 0) {
+ alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ status = PTR_ERR_OR_ZERO(alias);
+ } else
+ alias = ERR_PTR(status);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ return alias;
}
static int
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 28f4d5311c40..c1d9bd07285f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -233,9 +233,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* as well be forgiving and just succeed silently.
*/
goto out_put;
- status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ if (IS_ERR(dentry))
+ status = PTR_ERR(dentry);
out_put:
- dput(dentry);
+ if (!status)
+ dput(dentry);
out_unlock:
inode_unlock(d_inode(dir));
if (status == 0) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 29cb7b812d71..34d7aa531662 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1461,7 +1461,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct inode *dirp;
struct iattr *iap = attrs->na_iattr;
__be32 err;
- int host_err;
+ int host_err = 0;
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
@@ -1488,28 +1488,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
- host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
- if (!host_err && unlikely(d_unhashed(dchild))) {
- struct dentry *d;
- d = lookup_one_len(dchild->d_name.name,
- dchild->d_parent,
- dchild->d_name.len);
- if (IS_ERR(d)) {
- host_err = PTR_ERR(d);
- break;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = nfserr_serverfault;
- goto out;
- }
+ dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
+ if (IS_ERR(dchild)) {
+ host_err = PTR_ERR(dchild);
+ } else if (d_is_negative(dchild)) {
+ err = nfserr_serverfault;
+ goto out;
+ } else if (unlikely(dchild != resfhp->fh_dentry)) {
dput(resfhp->fh_dentry);
- resfhp->fh_dentry = dget(d);
- err = fh_update(resfhp);
- dput(dchild);
- dchild = d;
- if (err)
- goto out;
+ resfhp->fh_dentry = dget(dchild);
}
break;
case S_IFCHR:
@@ -1530,7 +1517,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
out:
- dput(dchild);
+ if (!IS_ERR(dchild))
+ dput(dchild);
return err;
out_nfserr:
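On the caller side, as the nfsd hunks above show, vfs_mkdir() now returns the dentry to keep using and consumes the reference it was handed when it fails, so error paths must not dput() an ERR_PTR value. A short sketch of that calling convention with generic names:

    /* Sketch of the post-conversion vfs_mkdir() calling convention. */
    static int example_mkdir_child(struct mnt_idmap *idmap, struct dentry *parent,
                                   struct dentry *child, umode_t mode)
    {
            child = vfs_mkdir(idmap, d_inode(parent), child, mode);
            if (IS_ERR(child))
                    return PTR_ERR(child);  /* reference already dropped */

            /* child may be an alias differing from the dentry passed in */
            dput(child);
            return 0;
    }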
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 953fbd5f0851..40f4b1a28705 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
-static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct nilfs_transaction_info ti;
@@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
- return err;
+ return ERR_PTR(err);
inc_nlink(dir);
@@ -258,7 +258,7 @@ out:
else
nilfs_transaction_abort(dir->i_sb);
- return err;
+ return ERR_PTR(err);
out_fail:
drop_nlink(inode);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 95646f7c46ca..6d386080faf2 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
return fanotify_error_event_equal(FANOTIFY_EE(old),
FANOTIFY_EE(new));
+ case FANOTIFY_EVENT_TYPE_MNT:
+ return false;
default:
WARN_ON_ONCE(1);
}
@@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- if (!fid_mode) {
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (data_type != FSNOTIFY_EVENT_MNT)
+ return 0;
+ } else if (!fid_mode) {
/* Do we have path to open a file descriptor? */
if (!path)
return 0;
@@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
return &pevent->fae;
}
+static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp)
+{
+ struct fanotify_mnt_event *pevent;
+
+ pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp);
+ if (!pevent)
+ return NULL;
+
+ pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT;
+ pevent->mnt_id = mnt_id;
+
+ return &pevent->fae;
+}
+
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
int data_type,
gfp_t gfp)
@@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event(
fid_mode);
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
+ u64 mnt_id = fsnotify_data_mnt_id(data, data_type);
struct mem_cgroup *old_memcg;
struct dentry *moved = NULL;
struct inode *child = NULL;
@@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event(
moved, &hash, gfp);
} else if (fid_mode) {
event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
- } else {
+ } else if (path) {
event = fanotify_alloc_path_event(path, &hash, gfp);
+ } else if (mnt_id) {
+ event = fanotify_alloc_mnt_event(mnt_id, gfp);
+ } else {
+ WARN_ON_ONCE(1);
}
if (!event)
@@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24);
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
mask, data, data_type, dir);
@@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group,
mempool_free(fee, &group->fanotify_data.error_events_pool);
}
+static void fanotify_free_mnt_event(struct fanotify_event *event)
+{
+ kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event));
+}
+
static void fanotify_free_event(struct fsnotify_group *group,
struct fsnotify_event *fsn_event)
{
@@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
fanotify_free_error_event(group, event);
break;
+ case FANOTIFY_EVENT_TYPE_MNT:
+ fanotify_free_mnt_event(event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index c12cbc270539..b44e70e44be6 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_fid_event_cachep;
extern struct kmem_cache *fanotify_path_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+extern struct kmem_cache *fanotify_mnt_event_cachep;
/* Possible states of the permission event */
enum {
@@ -244,6 +245,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
+ FANOTIFY_EVENT_TYPE_MNT,
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -409,12 +411,23 @@ struct fanotify_path_event {
struct path path;
};
+struct fanotify_mnt_event {
+ struct fanotify_event fae;
+ u64 mnt_id;
+};
+
static inline struct fanotify_path_event *
FANOTIFY_PE(struct fanotify_event *event)
{
return container_of(event, struct fanotify_path_event, fae);
}
+static inline struct fanotify_mnt_event *
+FANOTIFY_ME(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_mnt_event, fae);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
+static inline bool fanotify_is_mnt_event(u32 mask)
+{
+ return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH);
+}
+
static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ba3e2d09eb44..f2d840ae4ded 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
(sizeof(struct fanotify_event_info_range))
+#define FANOTIFY_MNT_INFO_LEN \
+ (sizeof(struct fanotify_event_info_mnt))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode,
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
}
+ if (fanotify_is_mnt_event(event->mask))
+ event_len += FANOTIFY_MNT_INFO_LEN;
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_LEN;
@@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_mnt_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_mnt info = { };
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
+ info.hdr.len = FANOTIFY_MNT_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.mnt_id = FANOTIFY_ME(event)->mnt_id;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
static size_t copy_error_info_to_user(struct fanotify_event *event,
char __user *buf, int count)
{
@@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_mnt_event(event->mask)) {
+ ret = copy_mnt_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
return -EINVAL;
+ /* Don't allow mixing mnt events with inode events for now */
+ if (flags & FAN_REPORT_MNT) {
+ if (class != FAN_CLASS_NOTIF)
+ return -EINVAL;
+ if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
+ return -EINVAL;
+ }
+
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
return -EINVAL;
@@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
struct inode *inode = NULL;
- struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
@@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
- void *obj;
+ void *obj = NULL;
u32 umask = 0;
int ret;
@@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
case FAN_MARK_FILESYSTEM:
obj_type = FSNOTIFY_OBJ_TYPE_SB;
break;
+ case FAN_MARK_MNTNS:
+ obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
+ break;
default:
return -EINVAL;
}
@@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
group = fd_file(f)->private_data;
+ /* Only report mount events on mnt namespace */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (mask & ~FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type != FAN_MARK_MNTNS)
+ return -EINVAL;
+ } else {
+ if (mask & FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type == FAN_MARK_MNTNS)
+ return -EINVAL;
+ }
+
/*
* An unprivileged user is not allowed to setup mount nor filesystem
* marks. This also includes setting up such marks by a group that
@@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
return -EINVAL;
@@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* inode held in place by reference to path; group by fget on fd */
- if (mark_type == FAN_MARK_INODE) {
+ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
- } else {
- mnt = path.mnt;
- if (mark_type == FAN_MARK_MOUNT)
- obj = mnt;
- else
- obj = mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ obj = path.mnt;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
+ obj = path.mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ obj = mnt_ns_from_dentry(path.dentry);
}
+ ret = -EINVAL;
+ if (!obj)
+ goto path_put_and_out;
+
/*
* If some other task has this inode open for write we should not add
* an ignore mask, unless that ignore mask is supposed to survive
@@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
- ret = mnt ? -EINVAL : -EISDIR;
+ ret = !inode ? -EINVAL : -EISDIR;
/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
if (ignore == FAN_MARK_IGNORE &&
- (mnt || S_ISDIR(inode->i_mode)))
+ (!inode || S_ISDIR(inode->i_mode)))
goto path_put_and_out;
ret = 0;
@@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
- if (mnt || !S_ISDIR(inode->i_mode)) {
+ if (!inode || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
umask = FAN_EVENT_ON_CHILD;
/*
@@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
@@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void)
fanotify_perm_event_cachep =
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
+ fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);
fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
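Taken together, the fanotify changes above add a notification-only mode for mount topology events. A rough userspace sketch of a listener (the FAN_REPORT_MNT, FAN_MARK_MNTNS, FAN_MNT_* constants and struct fanotify_event_info_mnt are assumed to come from uapi headers matching this series; error handling is omitted):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/fanotify.h>

    int main(void)
    {
            char buf[4096];
            int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_MNT, 0);

            /* Watch the caller's own mount namespace via its nsfs file. */
            fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
                          FAN_MNT_ATTACH | FAN_MNT_DETACH,
                          AT_FDCWD, "/proc/self/ns/mnt");

            for (;;) {
                    ssize_t len = read(fd, buf, sizeof(buf));
                    struct fanotify_event_metadata *md = (void *)buf;

                    for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
                            /* mount events carry no fd, only the mnt info record */
                            struct fanotify_event_info_mnt *mnt =
                                    (struct fanotify_event_info_mnt *)(md + 1);

                            printf("mask 0x%llx mnt_id %llu\n",
                                   (unsigned long long)md->mask,
                                   (unsigned long long)mnt->mnt_id);
                    }
            }
    }

FAN_REPORT_MNT groups are limited to FAN_CLASS_NOTIF and cannot be combined with FID reporting, matching the checks added in fanotify_init() and do_fanotify_mark() above.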
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index e933f9c65d90..1161eabf11ee 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
sb->s_dev, mflags, mark->mask, mark->ignore_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector);
+
+ seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n",
+ mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask);
}
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fae1b6d397ea..e2b4f17a48bb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
fsnotify_clear_marks_by_mount(mnt);
}
+void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
+{
+ fsnotify_clear_marks_by_mntns(mntns);
+}
+
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
* @sb: superblock being unmounted.
@@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
file_name, cookie, iter_info);
}
-static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
+static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp)
{
struct fsnotify_mark_connector *conn;
struct hlist_node *node = NULL;
@@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
{
const struct path *path = fsnotify_data_path(data, data_type);
struct super_block *sb = fsnotify_data_sb(data, data_type);
- struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+ const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
+ struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL;
struct fsnotify_iter_info iter_info = {};
struct mount *mnt = NULL;
struct inode *inode2 = NULL;
struct dentry *moved;
int inode2_type;
int ret = 0;
- __u32 test_mask, marks_mask;
+ __u32 test_mask, marks_mask = 0;
if (path)
mnt = real_mount(path->mnt);
@@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if ((!sbinfo || !sbinfo->sb_marks) &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
- (!inode2 || !inode2->i_fsnotify_marks))
+ (!inode2 || !inode2->i_fsnotify_marks) &&
+ (!mnt_data || !mnt_data->ns->n_fsnotify_marks))
return 0;
- marks_mask = READ_ONCE(sb->s_fsnotify_mask);
+ if (sb)
+ marks_mask |= READ_ONCE(sb->s_fsnotify_mask);
if (mnt)
marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask);
if (inode)
marks_mask |= READ_ONCE(inode->i_fsnotify_mask);
if (inode2)
marks_mask |= READ_ONCE(inode2->i_fsnotify_mask);
-
+ if (mnt_data)
+ marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask);
/*
* If this is a modify event we may need to clear some ignore masks.
@@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.marks[inode2_type] =
fsnotify_first_mark(&inode2->i_fsnotify_marks);
}
+ if (mnt_data) {
+ iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] =
+ fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks);
+ }
/*
* We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
@@ -708,11 +721,31 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
}
#endif
+void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ struct fsnotify_mnt data = {
+ .ns = ns,
+ .mnt_id = real_mount(mnt)->mnt_id_unique,
+ };
+
+ if (WARN_ON_ONCE(!ns))
+ return;
+
+ /*
+ * This is an optimization as well as making sure fsnotify_init() has
+ * been called.
+ */
+ if (!ns->n_fsnotify_marks)
+ return;
+
+ fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0);
+}
+
static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 663759ed6fbc..5950c7a67f41 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb(
return conn->obj;
}
+static inline struct mnt_namespace *fsnotify_conn_mntns(
+ struct fsnotify_mark_connector *conn)
+{
+ return conn->obj;
+}
+
static inline struct super_block *fsnotify_object_sb(void *obj,
enum fsnotify_obj_type obj_type)
{
@@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}
+static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
+{
+ fsnotify_destroy_marks(&mntns->n_fsnotify_marks);
+}
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4981439e6209..798340db69d7 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj,
return &real_mount(obj)->mnt_fsnotify_marks;
case FSNOTIFY_OBJ_TYPE_SB:
return fsnotify_sb_marks(obj);
+ case FSNOTIFY_OBJ_TYPE_MNTNS:
+ return &((struct mnt_namespace *)obj)->n_fsnotify_marks;
default:
return NULL;
}
@@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
+ else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS)
+ return &fsnotify_conn_mntns(conn)->n_fsnotify_mask;
return NULL;
}
@@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object(
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0;
}
rcu_assign_pointer(*connp, NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
return inode;
}
@@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
* Attach the sb info before attaching a connector to any object on sb.
* The sb info will remain attached as long as sb lives.
*/
- if (!fsnotify_sb_info(sb)) {
+ if (sb && !fsnotify_sb_info(sb)) {
err = fsnotify_attach_info_to_sb(sb);
if (err)
return err;
@@ -770,7 +777,8 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
diff --git a/fs/nsfs.c b/fs/nsfs.c
index f7fddf8ecf73..59aa801347a7 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -151,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
return 0;
}
+static bool nsfs_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case NS_GET_USERNS:
+ case NS_GET_PARENT:
+ case NS_GET_NSTYPE:
+ case NS_GET_OWNER_UID:
+ case NS_GET_MNTNS_ID:
+ case NS_GET_PID_FROM_PIDNS:
+ case NS_GET_TGID_FROM_PIDNS:
+ case NS_GET_PID_IN_PIDNS:
+ case NS_GET_TGID_IN_PIDNS:
+ return true;
+ }
+
+ /* Extensible ioctls require some extra handling. */
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(NS_MNT_GET_INFO):
+ case _IOC_NR(NS_MNT_GET_NEXT):
+ case _IOC_NR(NS_MNT_GET_PREV):
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_INFO));
+ }
+
+ return false;
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
struct pid_namespace *pid_ns;
struct task_struct *tsk;
- struct ns_common *ns = get_proc_ns(file_inode(filp));
+ struct ns_common *ns;
struct mnt_namespace *mnt_ns;
bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
+ if (!nsfs_ioctl_valid(ioctl))
+ return -ENOIOCTLCMD;
+
+ ns = get_proc_ns(file_inode(filp));
switch (ioctl) {
case NS_GET_USERNS:
return open_related_ns(ns, ns_get_owner);
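The second switch in nsfs_ioctl_valid() deliberately matches only _IOC_NR() and compares _IOC_TYPE(): extensible ioctls encode the size of a growing argument struct in the command word, so the full command value changes whenever the struct grows. A hypothetical illustration (names and numbers are made up):

    /*
     * Hypothetical example: two revisions of an extensible ioctl share
     * _IOC_TYPE and _IOC_NR but differ in _IOC_SIZE, so a plain switch on
     * the full command value would reject the newer revision.
     */
    struct foo_info_v1 { __u64 mask; };
    struct foo_info_v2 { __u64 mask; __u64 extra; };

    #define FOO_GET_INFO_V1 _IOWR('f', 0x01, struct foo_info_v1)
    #define FOO_GET_INFO_V2 _IOWR('f', 0x01, struct foo_info_v2)

    /* _IOC_NR(FOO_GET_INFO_V1) == _IOC_NR(FOO_GET_INFO_V2) still holds. */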
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index abf7e81584a9..652735a0b0c4 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -201,11 +201,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/*
* ntfs_mkdir- inode_operations::mkdir
*/
-static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
- NULL, 0, NULL);
+ return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
+ NULL, 0, NULL));
}
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 2a7f36643895..5130ec44e5e1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
-static int dlmfs_mkdir(struct mnt_idmap * idmap,
- struct inode * dir,
- struct dentry * dentry,
- umode_t mode)
+static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir,
+ struct dentry * dentry,
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
@@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap,
bail:
if (status < 0)
iput(inode);
- return status;
+ return ERR_PTR(status);
}
static int dlmfs_create(struct mnt_idmap *idmap,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0ec63a1a94b8..99278c8f0e24 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -644,10 +644,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
suballoc_loc, suballoc_bit);
}
-static int ocfs2_mkdir(struct mnt_idmap *idmap,
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
int ret;
@@ -657,7 +657,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap,
if (ret)
mlog_errno(ret);
- return ret;
+ return ERR_PTR(ret);
}
static int ocfs2_create(struct mnt_idmap *idmap,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 6bda275826d6..2ed541fccf33 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,10 +279,10 @@ out_free_inode:
return err;
}
-static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return omfs_add_node(dir, dentry, mode | S_IFDIR);
+ return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR));
}
static int omfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/open.c b/fs/open.c
index 1be20de9f283..a9063cca9911 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
-long vfs_truncate(const struct path *path, loff_t length)
+int vfs_truncate(const struct path *path, loff_t length)
{
struct mnt_idmap *idmap;
struct inode *inode;
- long error;
+ int error;
inode = path->dentry->d_inode;
@@ -123,7 +123,7 @@ mnt_drop_write_and_out:
}
EXPORT_SYMBOL_GPL(vfs_truncate);
-long do_sys_truncate(const char __user *pathname, loff_t length)
+int do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
@@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_ftruncate(struct file *file, loff_t length, int small)
+int do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
@@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small)
return error;
}
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
if (length < 0)
return -EINVAL;
@@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ int ret;
loff_t sum;
if (offset < 0 || len <= 0)
@@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void)
return override_creds(override_cred);
}
-static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
+static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
struct inode *inode;
@@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root,
}
EXPORT_SYMBOL(file_open_root);
-static long do_sys_openat2(int dfd, const char __user *filename,
- struct open_how *how)
+static int do_sys_openat2(int dfd, const char __user *filename,
+ struct open_how *how)
{
struct open_flags op;
- int fd = build_open_flags(how, &op);
struct filename *tmp;
+ int err, fd;
- if (fd)
- return fd;
+ err = build_open_flags(how, &op);
+ if (unlikely(err))
+ return err;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
- if (fd >= 0) {
+ if (likely(fd >= 0)) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
@@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename,
return fd;
}
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
@@ -1551,7 +1552,7 @@ int filp_close(struct file *filp, fl_owner_t id)
int retval;
retval = filp_flush(filp, id);
- fput(filp);
+ fput_close(filp);
return retval;
}
@@ -1577,13 +1578,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
* We're returning to user space. Don't bother
* with any delayed fput() cases.
*/
- __fput_sync(file);
+ fput_close_sync(file);
+
+ if (likely(retval == 0))
+ return 0;
/* can't restart close syscall because file table entry was cleared */
- if (unlikely(retval == -ERESTARTSYS ||
- retval == -ERESTARTNOINTR ||
- retval == -ERESTARTNOHAND ||
- retval == -ERESTART_RESTARTBLOCK))
+ if (retval == -ERESTARTSYS ||
+ retval == -ERESTARTNOINTR ||
+ retval == -ERESTARTNOHAND ||
+ retval == -ERESTART_RESTARTBLOCK)
retval = -EINTR;
return retval;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index d68372241b30..90c49c0de243 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
int buffer_index;
ssize_t ret;
size_t copy_amount;
- int open_for_read;
- int open_for_write;
+ bool open_for_read;
+ bool open_for_write;
new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
if (!new_op)
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aae6d2b8767d..5ac743c6bc2e 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -16,22 +16,22 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-static int orangefs_writepage_locked(struct page *page,
- struct writeback_control *wbc)
+static int orangefs_writepage_locked(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct orangefs_write_range *wr = NULL;
struct iov_iter iter;
struct bio_vec bv;
- size_t len, wlen;
+ size_t wlen;
ssize_t ret;
- loff_t off;
+ loff_t len, off;
- set_page_writeback(page);
+ folio_start_writeback(folio);
len = i_size_read(inode);
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
+ if (folio->private) {
+ wr = folio->private;
WARN_ON(wr->pos >= len);
off = wr->pos;
if (off + wr->len > len)
@@ -40,36 +40,27 @@ static int orangefs_writepage_locked(struct page *page,
wlen = wr->len;
} else {
WARN_ON(1);
- off = page_offset(page);
- if (off + PAGE_SIZE > len)
+ off = folio_pos(folio);
+ wlen = folio_size(folio);
+
+ if (wlen > len - off)
wlen = len - off;
- else
- wlen = PAGE_SIZE;
}
/* Should've been handled in orangefs_invalidate_folio. */
WARN_ON(off == len || off + wlen > len);
WARN_ON(wlen == 0);
- bvec_set_page(&bv, page, wlen, off % PAGE_SIZE);
+ bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off));
iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
if (ret < 0) {
- mapping_set_error(page->mapping, ret);
+ mapping_set_error(folio->mapping, ret);
} else {
ret = 0;
}
- kfree(detach_page_private(page));
- return ret;
-}
-
-static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- ret = orangefs_writepage_locked(page, wbc);
- unlock_page(page);
- end_page_writeback(page);
+ kfree(folio_detach_private(folio));
return ret;
}
@@ -79,33 +70,33 @@ struct orangefs_writepages {
kuid_t uid;
kgid_t gid;
int maxpages;
- int npages;
- struct page **pages;
+ int nfolios;
+ struct address_space *mapping;
+ struct folio **folios;
struct bio_vec *bv;
};
static int orangefs_writepages_work(struct orangefs_writepages *ow,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
- struct inode *inode = ow->pages[0]->mapping->host;
+ struct inode *inode = ow->mapping->host;
struct orangefs_write_range *wrp, wr;
struct iov_iter iter;
ssize_t ret;
- size_t len;
- loff_t off;
+ size_t start;
+ loff_t len, off;
int i;
len = i_size_read(inode);
- for (i = 0; i < ow->npages; i++) {
- set_page_writeback(ow->pages[i]);
- bvec_set_page(&ow->bv[i], ow->pages[i],
- min(page_offset(ow->pages[i]) + PAGE_SIZE,
- ow->off + ow->len) -
- max(ow->off, page_offset(ow->pages[i])),
- i == 0 ? ow->off - page_offset(ow->pages[i]) : 0);
+ start = offset_in_folio(ow->folios[0], ow->off);
+ for (i = 0; i < ow->nfolios; i++) {
+ folio_start_writeback(ow->folios[i]);
+ bvec_set_folio(&ow->bv[i], ow->folios[i],
+ folio_size(ow->folios[i]) - start, start);
+ start = 0;
}
- iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -116,40 +107,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
wr.gid = ow->gid;
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
0, &wr, NULL, NULL);
- if (ret < 0) {
- for (i = 0; i < ow->npages; i++) {
- mapping_set_error(ow->pages[i]->mapping, ret);
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
- } else {
+ if (ret < 0)
+ mapping_set_error(ow->mapping, ret);
+ else
ret = 0;
- for (i = 0; i < ow->npages; i++) {
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
+
+ for (i = 0; i < ow->nfolios; i++) {
+ wrp = folio_detach_private(ow->folios[i]);
+ kfree(wrp);
+ folio_end_writeback(ow->folios[i]);
+ folio_unlock(ow->folios[i]);
}
+
return ret;
}
static int orangefs_writepages_callback(struct folio *folio,
- struct writeback_control *wbc, void *data)
+ struct writeback_control *wbc, struct orangefs_writepages *ow)
{
- struct orangefs_writepages *ow = data;
struct orangefs_write_range *wr = folio->private;
int ret;
@@ -162,41 +137,41 @@ static int orangefs_writepages_callback(struct folio *folio,
}
ret = -1;
- if (ow->npages == 0) {
+ if (ow->nfolios == 0) {
ow->off = wr->pos;
ow->len = wr->len;
ow->uid = wr->uid;
ow->gid = wr->gid;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
ret = -1;
goto done;
}
if (ow->off + ow->len == wr->pos) {
ow->len += wr->len;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
done:
if (ret == -1) {
- if (ow->npages) {
+ if (ow->nfolios) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
- ret = orangefs_writepage_locked(&folio->page, wbc);
+ ret = orangefs_writepage_locked(folio, wbc);
mapping_set_error(folio->mapping, ret);
folio_unlock(folio);
folio_end_writeback(folio);
} else {
- if (ow->npages == ow->maxpages) {
+ if (ow->nfolios == ow->maxpages) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
}
return ret;
@@ -207,31 +182,35 @@ static int orangefs_writepages(struct address_space *mapping,
{
struct orangefs_writepages *ow;
struct blk_plug plug;
- int ret;
+ int error;
+ struct folio *folio = NULL;
+
ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
if (!ow)
return -ENOMEM;
ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
- ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
- if (!ow->pages) {
+ ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL);
+ if (!ow->folios) {
kfree(ow);
return -ENOMEM;
}
ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
if (!ow->bv) {
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow);
return -ENOMEM;
}
+ ow->mapping = mapping;
blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
- if (ow->npages)
- ret = orangefs_writepages_work(ow, wbc);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = orangefs_writepages_callback(folio, wbc, ow);
+ if (ow->nfolios)
+ error = orangefs_writepages_work(ow, wbc);
blk_finish_plug(&plug);
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow->bv);
kfree(ow);
- return ret;
+ return error;
}
static int orangefs_launder_folio(struct folio *);
@@ -484,7 +463,7 @@ static int orangefs_launder_folio(struct folio *folio)
};
folio_wait_writeback(folio);
if (folio_clear_dirty_for_io(folio)) {
- r = orangefs_writepage_locked(&folio->page, &wbc);
+ r = orangefs_writepage_locked(folio, &wbc);
folio_end_writeback(folio);
}
return r;
@@ -606,7 +585,6 @@ out:
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
- .writepage = orangefs_writepage,
.readahead = orangefs_readahead,
.read_folio = orangefs_read_folio,
.writepages = orangefs_writepages,
@@ -616,6 +594,7 @@ static const struct address_space_operations orangefs_address_operations = {
.invalidate_folio = orangefs_invalidate_folio,
.release_folio = orangefs_release_folio,
.free_folio = orangefs_free_folio,
+ .migrate_folio = filemap_migrate_folio,
.launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 200558ec72f0..82395fe2b956 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -300,8 +300,8 @@ out:
return ret;
}
-static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
@@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
if (!new_op)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
new_op->upcall.req.mkdir.parent_refn = parent->refn;
@@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
__orangefs_setattr(dir, &iattr);
out:
op_release(new_op);
- return ret;
+ return ERR_PTR(ret);
}
static int orangefs_rename(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index edcca4beb765..b562d3dbc76b 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -197,18 +197,6 @@ int orangefs_bufmap_size_query(void)
return size;
}
-int orangefs_bufmap_shift_query(void)
-{
- struct orangefs_bufmap *bufmap;
- int shift = 0;
- spin_lock(&orangefs_bufmap_lock);
- bufmap = __orangefs_bufmap;
- if (bufmap)
- shift = bufmap->desc_shift;
- spin_unlock(&orangefs_bufmap_lock);
- return shift;
-}
-
static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
@@ -532,16 +520,3 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
}
return 0;
}
-
-void orangefs_bufmap_page_fill(void *page_to,
- int buffer_index,
- int slot_index)
-{
- struct orangefs_bufmap_desc *from;
- void *page_from;
-
- from = &__orangefs_bufmap->desc_array[buffer_index];
- page_from = kmap_atomic(from->page_array[slot_index]);
- memcpy(page_to, page_from, PAGE_SIZE);
- kunmap_atomic(page_from);
-}
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
index 75b2d2833af1..4231175ccdb2 100644
--- a/fs/orangefs/orangefs-bufmap.h
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -10,8 +10,6 @@
int orangefs_bufmap_size_query(void);
-int orangefs_bufmap_shift_query(void);
-
int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
void orangefs_bufmap_finalize(void);
@@ -34,6 +32,5 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
int buffer_index,
size_t size);
-void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index);
#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index 6e079d4230d0..d4463534cec6 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -43,47 +43,4 @@
#define GOSSIP_MAX_NR 16
#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
-/* a private internal type */
-struct __keyword_mask_s {
- const char *keyword;
- __u64 mask_val;
-};
-
-/*
- * Map all kmod keywords to kmod debug masks here. Keep this
- * structure "packed":
- *
- * "all" is always last...
- *
- * keyword mask_val index
- * foo 1 0
- * bar 2 1
- * baz 4 2
- * qux 8 3
- * . . .
- */
-static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
- {"super", GOSSIP_SUPER_DEBUG},
- {"inode", GOSSIP_INODE_DEBUG},
- {"file", GOSSIP_FILE_DEBUG},
- {"dir", GOSSIP_DIR_DEBUG},
- {"utils", GOSSIP_UTILS_DEBUG},
- {"wait", GOSSIP_WAIT_DEBUG},
- {"acl", GOSSIP_ACL_DEBUG},
- {"dcache", GOSSIP_DCACHE_DEBUG},
- {"dev", GOSSIP_DEV_DEBUG},
- {"name", GOSSIP_NAME_DEBUG},
- {"bufmap", GOSSIP_BUFMAP_DEBUG},
- {"cache", GOSSIP_CACHE_DEBUG},
- {"debugfs", GOSSIP_DEBUGFS_DEBUG},
- {"xattr", GOSSIP_XATTR_DEBUG},
- {"init", GOSSIP_INIT_DEBUG},
- {"sysfs", GOSSIP_SYSFS_DEBUG},
- {"none", GOSSIP_NO_DEBUG},
- {"all", GOSSIP_MAX_DEBUG}
-};
-
-static const int num_kmod_keyword_mask_map = (int)
- (ARRAY_SIZE(s_kmod_keyword_mask_map));
-
#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index f52073022fae..f7095c91660c 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -44,6 +44,49 @@
#include "protocol.h"
#include "orangefs-kernel.h"
+/* a private internal type */
+struct __keyword_mask_s {
+ const char *keyword;
+ __u64 mask_val;
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed":
+ *
+ * "all" is always last...
+ *
+ * keyword mask_val index
+ * foo 1 0
+ * bar 2 1
+ * baz 4 2
+ * qux 8 3
+ * . . .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+ {"super", GOSSIP_SUPER_DEBUG},
+ {"inode", GOSSIP_INODE_DEBUG},
+ {"file", GOSSIP_FILE_DEBUG},
+ {"dir", GOSSIP_DIR_DEBUG},
+ {"utils", GOSSIP_UTILS_DEBUG},
+ {"wait", GOSSIP_WAIT_DEBUG},
+ {"acl", GOSSIP_ACL_DEBUG},
+ {"dcache", GOSSIP_DCACHE_DEBUG},
+ {"dev", GOSSIP_DEV_DEBUG},
+ {"name", GOSSIP_NAME_DEBUG},
+ {"bufmap", GOSSIP_BUFMAP_DEBUG},
+ {"cache", GOSSIP_CACHE_DEBUG},
+ {"debugfs", GOSSIP_DEBUGFS_DEBUG},
+ {"xattr", GOSSIP_XATTR_DEBUG},
+ {"init", GOSSIP_INIT_DEBUG},
+ {"sysfs", GOSSIP_SYSFS_DEBUG},
+ {"none", GOSSIP_NO_DEBUG},
+ {"all", GOSSIP_MAX_DEBUG}
+};
+
+static const int num_kmod_keyword_mask_map = (int)
+ (ARRAY_SIZE(s_kmod_keyword_mask_map));
+
#define DEBUG_HELP_STRING_SIZE 4096
#define HELP_STRING_UNINITIALIZED \
"Client Debug Keywords are unknown until the first time\n" \
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index c9993ff66fc2..fe493f3ed6b6 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,37 +138,6 @@ kill_whiteout:
goto out;
}
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode)
-{
- int err;
- struct dentry *d, *dentry = *newdentry;
-
- err = ovl_do_mkdir(ofs, dir, dentry, mode);
- if (err)
- return err;
-
- if (likely(!d_unhashed(dentry)))
- return 0;
-
- /*
- * vfs_mkdir() may succeed and leave the dentry passed
- * to it unhashed and negative. If that happens, try to
- * lookup a new hashed and positive dentry.
- */
- d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
- dentry, err);
- return PTR_ERR(d);
- }
- dput(dentry);
- *newdentry = d;
-
- return 0;
-}
-
struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
struct dentry *newdentry, struct ovl_cattr *attr)
{
@@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
case S_IFDIR:
/* mkdir is special... */
- err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
+ newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode);
+ err = PTR_ERR_OR_ZERO(newdentry);
break;
case S_IFCHR:
@@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
}
out:
if (err) {
- dput(newdentry);
+ if (!IS_ERR(newdentry))
+ dput(newdentry);
return ERR_PTR(err);
}
return newdentry;
@@ -282,7 +253,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
* XXX: if we ever use ovl_obtain_alias() to decode directory
* file handles, need to use ovl_get_inode_locked() and
* d_instantiate_new() here to prevent from creating two
- * hashed directory inode aliases.
+ * hashed directory inode aliases. We then need to return
+ * the obtained alias to ovl_mkdir().
*/
inode = ovl_get_inode(dentry->d_sb, &oip);
if (IS_ERR(inode))
@@ -687,10 +659,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir,
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
-static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+ return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL));
}
static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0021e2025020..6f2f8f4cfbbc 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
return err;
}
-static inline int ovl_do_mkdir(struct ovl_fs *ofs,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
- pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
- return err;
+ dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry));
+ return dentry;
}
static inline int ovl_do_mknod(struct ovl_fs *ofs,
@@ -838,8 +839,6 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct ovl_fs *ofs,
struct inode *dir, struct dentry *newdentry,
struct ovl_cattr *attr);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 1115c22deca0..6759f7d040c8 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -59,6 +59,7 @@ enum ovl_opt {
Opt_metacopy,
Opt_verity,
Opt_volatile,
+ Opt_override_creds,
};
static const struct constant_table ovl_parameter_bool[] = {
@@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool),
fsparam_enum("verity", Opt_verity, ovl_parameter_verity),
fsparam_flag("volatile", Opt_volatile),
+ fsparam_flag_no("override_creds", Opt_override_creds),
{}
};
@@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_userxattr:
config->userxattr = true;
break;
+ case Opt_override_creds: {
+ const struct cred *cred = NULL;
+
+ if (result.negated) {
+ swap(cred, ofs->creator_cred);
+ put_cred(cred);
+ break;
+ }
+
+ if (!current_in_userns(fc->user_ns)) {
+ err = -EINVAL;
+ break;
+ }
+
+ cred = prepare_creds();
+ if (cred)
+ swap(cred, ofs->creator_cred);
+ else
+ err = -ENOMEM;
+
+ put_cred(cred);
+ break;
+ }
default:
pr_err("unrecognized mount option \"%s\" or missing value\n",
param->key);
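The new override_creds option is a flag with an auto-generated negated form (fsparam_flag_no). A sketch of driving it through the new mount API from userspace (layer paths are placeholders, error handling is omitted, and the fsopen()/fsconfig() wrappers are assumed to be available from a recent libc):

    #include <sys/mount.h>

    static int example_overlay_fs_context(void)
    {
            int fs_fd = fsopen("overlay", FSOPEN_CLOEXEC);

            fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir", "/lower", 0);
            fsconfig(fs_fd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0);
            fsconfig(fs_fd, FSCONFIG_SET_STRING, "workdir", "/work", 0);

            /* fsparam_flag_no() also accepts the negated spelling: */
            fsconfig(fs_fd, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0);

            fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
            return fs_fd;
    }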
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 86ae6f6da36b..b63474d1b064 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -327,9 +327,10 @@ retry:
goto retry;
}
- err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
- if (err)
- goto out_dput;
+ work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
+ err = PTR_ERR(work);
+ if (IS_ERR(work))
+ goto out_err;
/* Weird filesystem returning with hashed negative (kernfs)? */
err = -EINVAL;
@@ -1305,6 +1306,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_fs_context *ctx = fc->fs_private;
+ const struct cred *old_cred = NULL;
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_layer *layers;
@@ -1318,10 +1320,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
- ofs->creator_cred = cred = prepare_creds();
+ if (!ofs->creator_cred)
+ ofs->creator_cred = cred = prepare_creds();
+ else
+ cred = (struct cred *)ofs->creator_cred;
if (!cred)
goto out_err;
+ old_cred = ovl_override_creds(sb);
+
err = ovl_fs_params_verify(ctx, &ofs->config);
if (err)
goto out_err;
@@ -1481,11 +1488,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = root_dentry;
+ ovl_revert_creds(old_cred);
return 0;
out_free_oe:
ovl_free_entry(oe);
out_err:
+ /*
+ * Revert creds before calling ovl_free_fs() which will call
+ * put_cred() and put_cred() requires that the cred's that are
+ * put are not the caller's creds, i.e., current->cred.
+ */
+ if (old_cred)
+ ovl_revert_creds(old_cred);
ovl_free_fs(ofs);
sb->s_fs_info = NULL;
return err;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index c0478b3c55d9..d64a4cbeb0da 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -24,6 +24,28 @@
#include "internal.h"
#include "mount.h"
+static struct kmem_cache *pidfs_cachep __ro_after_init;
+
+/*
+ * Stashes information that userspace needs to access even after the
+ * process has been reaped.
+ */
+struct pidfs_exit_info {
+ __u64 cgroupid;
+ __s32 exit_code;
+};
+
+struct pidfs_inode {
+ struct pidfs_exit_info __pei;
+ struct pidfs_exit_info *exit_info;
+ struct inode vfs_inode;
+};
+
+static inline struct pidfs_inode *pidfs_i(struct inode *inode)
+{
+ return container_of(inode, struct pidfs_inode, vfs_inode);
+}
+
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -188,36 +210,48 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = pidfd_pid(file);
- bool thread = file->f_flags & PIDFD_THREAD;
struct task_struct *task;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
/*
- * Depending on PIDFD_THREAD, inform pollers when the thread
- * or the whole thread-group exits.
+ * Don't wake waiters if the thread-group leader exited
+ * prematurely. They either get notified when the last subthread
+ * exits or not at all if one of the remaining subthreads execs
+ * and assumes the struct pid of the old thread-group leader.
*/
guard(rcu)();
task = pid_task(pid, PIDTYPE_PID);
if (!task)
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
- else if (task->exit_state && (thread || thread_group_empty(task)))
+ else if (task->exit_state && !delay_group_leader(task))
poll_flags = EPOLLIN | EPOLLRDNORM;
return poll_flags;
}
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static inline bool pid_in_current_pidns(const struct pid *pid)
+{
+ const struct pid_namespace *ns = task_active_pid_ns(current);
+
+ if (ns->level <= pid->level)
+ return pid->numbers[ns->level].ns == ns;
+
+ return false;
+}
+
+static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct inode *inode = file_inode(file);
+ struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
+ struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
+ struct task_struct *task;
const struct cred *c;
__u64 mask;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
-#endif
if (!uinfo)
return -EINVAL;
@@ -227,6 +261,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
return -EFAULT;
+ /*
+ * Restrict information retrieval to tasks within the caller's pid
+ * namespace hierarchy.
+ */
+ if (!pid_in_current_pidns(pid))
+ return -ESRCH;
+
+ if (mask & PIDFD_INFO_EXIT) {
+ exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ if (exit_info) {
+ kinfo.mask |= PIDFD_INFO_EXIT;
+#ifdef CONFIG_CGROUPS
+ kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+#endif
+ kinfo.exit_code = exit_info->exit_code;
+ }
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ /*
+ * If the task has already been reaped, only exit
+ * information is available.
+ */
+ if (!(mask & PIDFD_INFO_EXIT))
+ return -ESRCH;
+
+ goto copy_out;
+ }
+
c = get_task_cred(task);
if (!c)
return -ESRCH;
@@ -246,11 +311,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
put_cred(c);
#ifdef CONFIG_CGROUPS
- rcu_read_lock();
- cgrp = task_dfl_cgroup(task);
- kinfo.cgroupid = cgroup_id(cgrp);
- kinfo.mask |= PIDFD_INFO_CGROUPID;
- rcu_read_unlock();
+ if (!kinfo.cgroupid) {
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+ }
#endif
/*
@@ -270,16 +339,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
return -ESRCH;
+copy_out:
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
* userspace knows about will be copied. If userspace provides a new
* struct, only the bits that the kernel knows about will be copied.
*/
- if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
- return -EFAULT;
-
- return 0;
+ return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
static bool pidfs_ioctl_valid(unsigned int cmd)
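
For reference, a minimal userspace consumer of the extended ioctl could look like the sketch below. It assumes uapi headers new enough to define PIDFD_GET_INFO, PIDFD_INFO_EXIT and struct pidfd_info; the flag and field names (mask, exit_code) are taken from the hunks above, and the sketch is illustrative rather than canonical.

/* Illustrative only: requires uapi headers that define PIDFD_GET_INFO. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/pidfd.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	int pidfd = (int)syscall(SYS_pidfd_open, atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	struct pidfd_info info;
	memset(&info, 0, sizeof(info));
	info.mask = PIDFD_INFO_EXIT;	/* ask for exit info if the task is gone */

	if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
		perror("PIDFD_GET_INFO");
		return 1;
	}

	if (info.mask & PIDFD_INFO_EXIT)
		printf("exit_code: %d\n", info.exit_code);
	else
		printf("task still running (returned mask 0x%llx)\n",
		       (unsigned long long)info.mask);
	return 0;
}
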
@@ -317,7 +384,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
- struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
@@ -332,13 +398,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return put_user(file_inode(file)->i_generation, argp);
}
- task = get_pid_task(pid, PIDTYPE_PID);
- if (!task)
- return -ESRCH;
-
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
- return pidfd_info(task, cmd, arg);
+ return pidfd_info(file, cmd, arg);
+
+ task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
if (arg)
return -EINVAL;
@@ -450,6 +516,49 @@ struct pid *pidfd_pid(const struct file *file)
return file_inode(file)->i_private;
}
+/*
+ * We're called from release_task(). We know there's at least one
+ * reference to struct pid being held that won't be released until the
+ * task has been reaped which cannot happen until we're out of
+ * release_task().
+ *
+ * If this struct pid is referred to by a pidfd then
+ * stashed_dentry_get() will return the dentry and inode for that struct
+ * pid. Since we've taken a reference on it, there's now an additional
+ * reference on it from the exit path, which is fine: we're going to put
+ * it again in a second, and we know that the pid is kept alive anyway.
+ *
+ * Worst case is that we've filled in the info and immediately free the
+ * dentry and inode afterwards since the pidfd has been closed. Since
+ * pidfs_exit() currently is placed after exit_task_work() we know that
+ * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ */
+void pidfs_exit(struct task_struct *tsk)
+{
+ struct dentry *dentry;
+
+ might_sleep();
+
+ dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
+ if (dentry) {
+ struct inode *inode = d_inode(dentry);
+ struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
+#endif
+ exit_info->exit_code = tsk->exit_code;
+
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
+ dput(dentry);
+ }
+}
+
static struct vfsmount *pidfs_mnt __ro_after_init;
/*
@@ -505,9 +614,30 @@ static void pidfs_evict_inode(struct inode *inode)
put_pid(pid);
}
+static struct inode *pidfs_alloc_inode(struct super_block *sb)
+{
+ struct pidfs_inode *pi;
+
+ pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL);
+ if (!pi)
+ return NULL;
+
+ memset(&pi->__pei, 0, sizeof(pi->__pei));
+ pi->exit_info = NULL;
+
+ return &pi->vfs_inode;
+}
+
+static void pidfs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+}
+
static const struct super_operations pidfs_sops = {
+ .alloc_inode = pidfs_alloc_inode,
.drop_inode = generic_delete_inode,
.evict_inode = pidfs_evict_inode,
+ .free_inode = pidfs_free_inode,
.statfs = simple_statfs,
};
@@ -633,8 +763,49 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
return 0;
}
+static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
+ unsigned int flags)
+{
+ enum pid_type type;
+
+ if (flags & PIDFD_CLONE)
+ return true;
+
+ /*
+ * Make sure that if a pidfd is created, PIDFD_INFO_EXIT
+ * information will be available. So after an inode for the
+ * pidfd has been allocated, perform another check that the pid
+ * is still alive. If it is, exit information is available even
+ * if the task gets reaped before the pidfd is returned to
+ * userspace. The only exception is PIDFD_CLONE, where no task
+ * linkage has been established for @pid yet and the kernel is
+ * in the middle of process creation, so there's nothing for
+ * pidfs to miss.
+ */
+ if (flags & PIDFD_THREAD)
+ type = PIDTYPE_PID;
+ else
+ type = PIDTYPE_TGID;
+
+ /*
+ * Since pidfs_exit() is called before struct pid's task linkage
+ * is removed, the case where the task got reaped but a dentry
+ * was already attached to struct pid, and exit information was
+ * recorded and published, can be handled correctly.
+ */
+ if (unlikely(!pid_has_task(pid, type))) {
+ struct inode *inode = d_inode(path->dentry);
+ return !!READ_ONCE(pidfs_i(inode)->exit_info);
+ }
+
+ return true;
+}
+
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
+ if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags))
+ return ERR_PTR(-ESRCH);
+
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
@@ -698,22 +869,46 @@ static struct file_system_type pidfs_type = {
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
-
struct file *pidfd_file;
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
+ /*
+ * Ensure that PIDFD_CLONE can be passed as a flag without
+ * overloading other uapi pidfd flags.
+ */
+ BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
+ BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
return ERR_PTR(ret);
+ if (!pidfs_pid_valid(pid, &path, flags))
+ return ERR_PTR(-ESRCH);
+
+ flags &= ~PIDFD_CLONE;
pidfd_file = dentry_open(&path, flags, current_cred());
- path_put(&path);
+ /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ if (!IS_ERR(pidfd_file))
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
return pidfd_file;
}
+static void pidfs_inode_init_once(void *data)
+{
+ struct pidfs_inode *pi = data;
+
+ inode_init_once(&pi->vfs_inode);
+}
+
void __init pidfs_init(void)
{
+ pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC),
+ pidfs_inode_init_once);
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
diff --git a/fs/pipe.c b/fs/pipe.c
index 4d0799e4e719..da45edd68c41 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
pipe_lock(pipe2);
}
+static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe)
+{
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i]) {
+ struct page *page = pipe->tmp_page[i];
+ pipe->tmp_page[i] = NULL;
+ return page;
+ }
+ }
+
+ return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
+}
+
+static void anon_pipe_put_page(struct pipe_inode_info *pipe,
+ struct page *page)
+{
+ if (page_count(page) == 1) {
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (!pipe->tmp_page[i]) {
+ pipe->tmp_page[i] = page;
+ return;
+ }
+ }
+ }
+
+ put_page(page);
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
- /*
- * If nobody else uses this page, and we don't already have a
- * temporary page, let's keep track of it as a one-deep
- * allocation cache. (Otherwise just release our reference to it)
- */
- if (page_count(page) == 1 && !pipe->tmp_page)
- pipe->tmp_page = page;
- else
- put_page(page);
+ anon_pipe_put_page(pipe, page);
}
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
@@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
}
static ssize_t
-pipe_read(struct kiocb *iocb, struct iov_iter *to)
+anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
@@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
/* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->note_loss) {
@@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
#endif
if (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t chars = buf->len;
size_t written;
int error;
@@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
break;
}
mutex_unlock(&pipe->mutex);
-
/*
* We only get here if we didn't actually read anything.
*
- * However, we could have seen (and removed) a zero-sized
- * pipe buffer, and might have made space in the buffers
- * that way.
- *
- * You can't make zero-sized pipe buffers by doing an empty
- * write (not even in packet mode), but they can happen if
- * the writer gets an EFAULT when trying to fill a buffer
- * that already got allocated and inserted in the buffer
- * array.
- *
- * So we still need to wake up any pending writers in the
- * _very_ unlikely case that the pipe was full, but we got
- * no data.
- */
- if (unlikely(wake_writer))
- wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-
- /*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
@@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
- wake_writer = false;
wake_next_reader = true;
mutex_lock(&pipe->mutex);
}
@@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ int ret = anon_pipe_read(iocb, to);
if (ret > 0)
- file_accessed(filp);
+ file_accessed(iocb->ki_filp);
return ret;
}
@@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe)
}
static ssize_t
-pipe_write(struct kiocb *iocb, struct iov_iter *from)
+anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
@@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
int offset = buf->offset + buf->len;
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
@@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
- unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf;
- struct page *page = pipe->tmp_page;
+ struct page *page;
int copied;
- if (!page) {
- page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
- if (unlikely(!page)) {
- ret = ret ? : -ENOMEM;
- break;
- }
- pipe->tmp_page = page;
+ page = anon_pipe_get_page(pipe);
+ if (unlikely(!page)) {
+ if (!ret)
+ ret = -ENOMEM;
+ break;
}
- /* Allocate a slot in the ring in advance and attach an
- * empty buffer. If we fault or otherwise fail to use
- * it, either the reader will consume it or it'll still
- * be there for the next write.
- */
- pipe->head = head + 1;
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ anon_pipe_put_page(pipe, page);
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ pipe->head = head + 1;
/* Insert it into the buffer array */
- buf = &pipe->bufs[head & mask];
+ buf = pipe_buf(pipe, head);
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
- pipe->tmp_page = NULL;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += copied;
buf->len = copied;
+ ret += copied;
if (!iov_iter_count(from))
break;
- }
- if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
+ }
/* Wait for buffer space to become available. */
if ((filp->f_flags & O_NONBLOCK) ||
@@ -602,11 +596,21 @@ out:
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
- int err = file_update_time(filp);
- if (err)
- ret = err;
- sb_end_write(file_inode(filp)->i_sb);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ int ret = anon_pipe_write(iocb, from);
+ if (ret > 0) {
+ struct file *filp = iocb->ki_filp;
+ if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
+ int err = file_update_time(filp);
+ if (err)
+ ret = err;
+ sb_end_write(file_inode(filp)->i_sb);
+ }
}
return ret;
}
@@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (pipe->watch_queue)
put_watch_queue(pipe->watch_queue);
#endif
- if (pipe->tmp_page)
- __free_page(pipe->tmp_page);
+ for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i])
+ __free_page(pipe->tmp_page[i]);
+ }
kfree(pipe->bufs);
kfree(pipe);
}
@@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = {
.d_dname = pipefs_dname,
};
+static const struct file_operations pipeanon_fops;
+
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
@@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void)
inode->i_pipe = pipe;
pipe->files = 2;
pipe->readers = pipe->writers = 1;
- inode->i_fop = &pipefifo_fops;
+ inode->i_fop = &pipeanon_fops;
/*
* Mark the inode dirty from the very beginning,
@@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags)
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
@@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags)
f->f_pipe = 0;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
@@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe)
static int fifo_open(struct inode *inode, struct file *filp)
{
+ bool is_pipe = inode->i_fop == &pipeanon_fops;
struct pipe_inode_info *pipe;
- bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
int ret;
filp->f_pipe = 0;
@@ -1234,8 +1242,19 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
- .read_iter = pipe_read,
- .write_iter = pipe_write,
+ .read_iter = fifo_pipe_read,
+ .write_iter = fifo_pipe_write,
+ .poll = pipe_poll,
+ .unlocked_ioctl = pipe_ioctl,
+ .release = pipe_release,
+ .fasync = pipe_fasync,
+ .splice_write = iter_file_splice_write,
+};
+
+static const struct file_operations pipeanon_fops = {
+ .open = fifo_open,
+ .read_iter = anon_pipe_read,
+ .write_iter = anon_pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
@@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
+ /* Refuse nr_slots larger than the limits of pipe->{head,tail}. */
+ if (unlikely(nr_slots > (pipe_index_t)-1u))
+ return -EINVAL;
+
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
@@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
struct pipe_inode_info *pipe = file->private_data;
- if (file->f_op != &pipefifo_fops || !pipe)
+ if (!pipe)
+ return NULL;
+ if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
return NULL;
if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
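
The anon_pipe_get_page()/anon_pipe_put_page() helpers above generalise the old one-deep tmp_page cache to a small per-pipe array. A userspace-style model of that recycling scheme follows, with calloc()/free() standing in for alloc_page()/put_page() and a plain refcount standing in for page_count(); the names and the cache size are illustrative assumptions, not the kernel's layout.

#include <stdio.h>
#include <stdlib.h>

#define PIPE_PAGE_CACHE 2	/* models the size of pipe->tmp_page[] */
#define PAGE_BYTES 4096

struct page { int refcount; char data[PAGE_BYTES]; };
struct pipe { struct page *tmp_page[PIPE_PAGE_CACHE]; };

/* Prefer a cached page; fall back to a fresh allocation. */
static struct page *pipe_get_page(struct pipe *p)
{
	for (int i = 0; i < PIPE_PAGE_CACHE; i++) {
		if (p->tmp_page[i]) {
			struct page *page = p->tmp_page[i];
			p->tmp_page[i] = NULL;
			return page;
		}
	}
	struct page *page = calloc(1, sizeof(*page));
	if (page)
		page->refcount = 1;
	return page;
}

/* Recycle pages nobody else references; otherwise drop our reference. */
static void pipe_put_page(struct pipe *p, struct page *page)
{
	if (page->refcount == 1) {
		for (int i = 0; i < PIPE_PAGE_CACHE; i++) {
			if (!p->tmp_page[i]) {
				p->tmp_page[i] = page;
				return;
			}
		}
	}
	if (--page->refcount == 0)
		free(page);
}

int main(void)
{
	struct pipe p = { { NULL } };
	struct page *a = pipe_get_page(&p);

	pipe_put_page(&p, a);		/* goes into the cache, not freed */
	struct page *b = pipe_get_page(&p);
	printf("recycled: %s\n", b == a ? "yes" : "no");
	free(b);
	return 0;
}
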
diff --git a/fs/pnode.c b/fs/pnode.c
index ef048f008bdd..7a062a5de10e 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m,
struct mount *origin)
{
/* are there any slaves of this mount? */
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
while (1) {
@@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m,
* Advance m such that propagation_next will not return
* the slaves of m.
*/
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
m = last_slave(m);
return m;
@@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
while (1) {
while (1) {
struct mount *next;
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
@@ -226,7 +226,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
struct mount *child;
int type;
/* skip ones added by this propagate_mnt() */
- if (IS_MNT_NEW(m))
+ if (IS_MNT_PROPAGATED(m))
return 0;
/* skip if mountpoint isn't covered by it */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
@@ -380,7 +380,7 @@ bool propagation_would_overmount(const struct mount *from,
if (!IS_MNT_SHARED(from))
return false;
- if (IS_MNT_NEW(to))
+ if (IS_MNT_PROPAGATED(to))
return false;
if (to->mnt.mnt_root != mp->m_dentry)
@@ -549,8 +549,10 @@ static void restore_mounts(struct list_head *to_restore)
mp = parent->mnt_mp;
parent = parent->mnt_parent;
}
- if (parent != mnt->mnt_parent)
+ if (parent != mnt->mnt_parent) {
mnt_change_mountpoint(parent, mp, mnt);
+ mnt_notify_add(mnt);
+ }
}
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 0b02a6393891..ddafe0d087ca 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,7 @@
#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
+#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING))
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd89e956c322..5538c4aee8fa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
@@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq.info.si_signo,
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
timer->sigq.info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
@@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1cb33771bf9f..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -34,8 +34,6 @@
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -122,7 +120,9 @@ static void update_kcore_size(void)
kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
kcore_notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
VMCOREINFO_NOTE_NAME_BYTES +
ALIGN(sizeof(struct elf_prstatus), 4) +
ALIGN(sizeof(struct elf_prpsinfo), 4) +
@@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
- append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
sizeof(prstatus));
- append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
sizeof(prpsinfo));
- append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
arch_task_struct_size);
/*
* vmcoreinfo_size is mostly constant after init time, but it
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 56815799ce79..bb3b769edc71 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -14,10 +14,10 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/string.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/ramfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/pstore.h>
@@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb)
}
enum {
- Opt_kmsg_bytes, Opt_err
+ Opt_kmsg_bytes
};
-static const match_table_t tokens = {
- {Opt_kmsg_bytes, "kmsg_bytes=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec pstore_param_spec[] = {
+ fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes),
+ {}
};
-static void parse_options(char *options)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return;
+struct pstore_context {
+ unsigned int kmsg_bytes;
+};
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct pstore_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- if (!*p)
- continue;
+ opt = fs_parse(fc, pstore_param_spec, param, &result);
+ /* pstore has historically ignored invalid kmsg_bytes param */
+ if (opt < 0)
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_kmsg_bytes:
- if (!match_int(&args[0], &option))
- pstore_set_kmsg_bytes(option);
- break;
- }
+ switch (opt) {
+ case Opt_kmsg_bytes:
+ ctx->kmsg_bytes = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
+
+ return 0;
}
/*
@@ -265,14 +266,16 @@ static void parse_options(char *options)
static int pstore_show_options(struct seq_file *m, struct dentry *root)
{
if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES)
- seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes);
+ seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes);
return 0;
}
-static int pstore_remount(struct super_block *sb, int *flags, char *data)
+static int pstore_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- parse_options(data);
+ struct pstore_context *ctx = fc->fs_private;
+
+ sync_filesystem(fc->root->d_sb);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
return 0;
}
@@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = pstore_evict_inode,
- .remount_fs = pstore_remount,
.show_options = pstore_show_options,
};
@@ -406,8 +408,9 @@ void pstore_get_records(int quiet)
inode_unlock(d_inode(root));
}
-static int pstore_fill_super(struct super_block *sb, void *data, int silent)
+static int pstore_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct pstore_context *ctx = fc->fs_private;
struct inode *inode;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &pstore_ops;
sb->s_time_gran = 1;
- parse_options(data);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
inode = pstore_get_inode(sb);
if (inode) {
@@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
-static struct dentry *pstore_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int pstore_get_tree(struct fs_context *fc)
+{
+ if (fc->root)
+ return pstore_reconfigure(fc);
+
+ return get_tree_single(fc, pstore_fill_super);
+}
+
+static void pstore_free_fc(struct fs_context *fc)
{
- return mount_single(fs_type, flags, data, pstore_fill_super);
+ kfree(fc->fs_private);
}
+static const struct fs_context_operations pstore_context_ops = {
+ .parse_param = pstore_parse_param,
+ .get_tree = pstore_get_tree,
+ .reconfigure = pstore_reconfigure,
+ .free = pstore_free_fc,
+};
+
static void pstore_kill_sb(struct super_block *sb)
{
guard(mutex)(&pstore_sb_lock);
@@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb)
INIT_LIST_HEAD(&records_list);
}
+static int pstore_init_fs_context(struct fs_context *fc)
+{
+ struct pstore_context *ctx;
+
+ ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ /*
+ * The global kmsg_bytes is initialized to the default and updated
+ * every time we (re)mount the single-sb filesystem with the
+ * option specified.
+ */
+ ctx->kmsg_bytes = kmsg_bytes;
+
+ fc->fs_private = ctx;
+ fc->ops = &pstore_context_ops;
+
+ return 0;
+}
+
static struct file_system_type pstore_fs_type = {
.owner = THIS_MODULE,
.name = "pstore",
- .mount = pstore_mount,
.kill_sb = pstore_kill_sb,
+ .init_fs_context = pstore_init_fs_context,
+ .parameters = pstore_param_spec,
};
int __init pstore_init_fs(void)
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 801d6c0b170c..a0fc51196910 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -6,7 +6,7 @@
#include <linux/time.h>
#include <linux/pstore.h>
-extern unsigned long kmsg_bytes;
+extern unsigned int kmsg_bytes;
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
@@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {}
extern struct pstore_info *psinfo;
-extern void pstore_set_kmsg_bytes(int);
+extern void pstore_set_kmsg_bytes(unsigned int bytes);
extern void pstore_get_records(int);
extern void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f56b066ab80c..557cf9d40177 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -92,8 +92,8 @@ module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
/* How much of the kernel log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-module_param(kmsg_bytes, ulong, 0444);
+unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, uint, 0444);
MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
static void *compress_workspace;
@@ -107,9 +107,9 @@ static void *compress_workspace;
static char *big_oops_buf;
static size_t max_compressed_size;
-void pstore_set_kmsg_bytes(int bytes)
+void pstore_set_kmsg_bytes(unsigned int bytes)
{
- kmsg_bytes = bytes;
+ WRITE_ONCE(kmsg_bytes, bytes);
}
/* Tag each group of saved records with a sequence number */
@@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
struct kmsg_dump_detail *detail)
{
struct kmsg_dump_iter iter;
+ unsigned int remaining = READ_ONCE(kmsg_bytes);
unsigned long total = 0;
const char *why;
unsigned int part = 1;
@@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
kmsg_dump_rewind(&iter);
oopscount++;
- while (total < kmsg_bytes) {
+ while (total < remaining) {
char *dst;
size_t dst_size;
int header_size;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 8006faaaf0ec..775fa905fda0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -119,13 +119,13 @@ out:
return error;
}
-static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/read_write.c b/fs/read_write.c
index a6133241dfb8..bb0ed26a0b3a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
if (whence == SEEK_CUR) {
/*
- * f_lock protects against read/modify/write race with
- * other SEEK_CURs. Note that parallel writes and reads
- * behave like SEEK_SET.
+ * If the file requires locking via f_pos_lock we know
+ * that mutual exclusion for SEEK_CUR on the same file
+ * is guaranteed. If the file isn't locked, we take
+ * f_lock to protect against f_pos races with other
+ * SEEK_CURs.
*/
- guard(spinlock)(&file->f_lock);
+ if (file_seek_cur_needs_f_lock(file)) {
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
return vfs_setpos(file, file->f_pos + offset, maxsize);
}
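
The SEEK_CUR change above is about the classic lost-update problem on f_pos: without mutual exclusion, two concurrent pos = pos + offset updates can collapse into one. The toy program below only demonstrates why either f_pos_lock or f_lock must serialise the read-modify-write; nothing in it is VFS code, and the unlocked run is intentionally racy.

#include <pthread.h>
#include <stdio.h>

#define ITERS 1000000

static long long pos;			/* stand-in for file->f_pos */
static pthread_mutex_t f_lock = PTHREAD_MUTEX_INITIALIZER;
static int use_lock;

static void *seek_cur(void *arg)
{
	(void)arg;
	for (int i = 0; i < ITERS; i++) {
		if (use_lock)
			pthread_mutex_lock(&f_lock);
		pos = pos + 1;		/* read-modify-write, racy without the lock */
		if (use_lock)
			pthread_mutex_unlock(&f_lock);
	}
	return NULL;
}

static long long run(int locked)
{
	pthread_t a, b;

	pos = 0;
	use_lock = locked;
	pthread_create(&a, NULL, seek_cur, NULL);
	pthread_create(&b, NULL, seek_cur, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return pos;
}

int main(void)
{
	/* The unlocked run will usually report fewer than the expected updates. */
	printf("unlocked: %lld (expected %d)\n", run(0), 2 * ITERS);
	printf("locked:   %lld (expected %d)\n", run(1), 2 * ITERS);
	return 0;
}
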
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d1a5f43ce466..d469782f97f4 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
return ufd;
}
- file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx,
- O_RDWR | (flags & O_NONBLOCK));
+ file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops,
+ ctx, O_RDWR | (flags & O_NONBLOCK),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
-
fd_install(ufd, file);
} else {
CLASS(fd, f)(ufd);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 831fee962c4d..8dea0cf3a8de 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, dev_t);
-extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename2(struct mnt_idmap *, struct inode *,
struct dentry *, struct inode *, struct dentry *,
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 8582cf61242c..9e4f7378f30f 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
spin_unlock(&tcon->tc_lock);
/*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * BB Add call to evict_inodes(sb) for all superblocks mounted
* to this tcon.
*/
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 616149c7f0a5..3bb21aa58474 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2207,8 +2207,8 @@ posix_mkdir_get_info:
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
- struct dentry *direntry, umode_t mode)
+struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
+ struct dentry *direntry, umode_t mode)
{
int rc = 0;
unsigned int xid;
@@ -2224,10 +2224,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
cifs_sb = CIFS_SB(inode->i_sb);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
- return PTR_ERR(tlink);
+ return ERR_CAST(tlink);
tcon = tlink_tcon(tlink);
xid = get_xid();
@@ -2283,7 +2283,7 @@ mkdir_out:
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
- return rc;
+ return ERR_PTR(rc);
}
int cifs_rmdir(struct inode *inode, struct dentry *direntry)
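
cifs_mkdir() now follows the pointer-or-error convention (ERR_PTR()/IS_ERR()/ERR_CAST()) that the new ->mkdir prototype expects. For readers less familiar with the idiom, here is a self-contained userspace approximation of those helpers and how a caller consumes them; the simplified macros are a sketch, not the kernel's definitions.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified userspace approximations of the kernel's ERR_PTR helpers. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err)      { return (void *)(intptr_t)err; }
static inline long PTR_ERR(const void *p)  { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct dentry { const char *name; };

static struct dentry *do_mkdir(const char *name, int fail)
{
	static struct dentry d;

	if (fail)
		return ERR_PTR(-EIO);	/* error encoded in the pointer itself */
	d.name = name;
	return &d;
}

int main(void)
{
	struct dentry *d = do_mkdir("subdir", 0);

	if (IS_ERR(d))
		printf("mkdir failed: %ld\n", PTR_ERR(d));
	else
		printf("created %s\n", d->name);

	d = do_mkdir("subdir", 1);
	if (IS_ERR(d))
		printf("mkdir failed: %ld\n", PTR_ERR(d));
	return 0;
}
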
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 6890016e1923..8554aa5a1059 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -113,11 +113,6 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
if (IS_ERR(d))
goto err_out;
- if (d_is_negative(d)) {
- dput(d);
- goto err_out;
- }
-
path->dentry = d;
path->mnt = mntget(parent_path->mnt);
@@ -211,8 +206,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
{
struct mnt_idmap *idmap;
struct path path;
- struct dentry *dentry;
- int err;
+ struct dentry *dentry, *d;
+ int err = 0;
dentry = ksmbd_vfs_kern_path_create(work, name,
LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
@@ -227,27 +222,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
- err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
- if (!err && d_unhashed(dentry)) {
- struct dentry *d;
-
- d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- err = PTR_ERR(d);
- goto out_err;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = -ENOENT;
- goto out_err;
- }
-
- ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
- dput(d);
- }
+ d = dentry;
+ dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
+ if (IS_ERR(dentry))
+ err = PTR_ERR(dentry);
+ else if (d_is_negative(dentry))
+ err = -ENOENT;
+ if (!err && dentry != d)
+ ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry));
-out_err:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -693,6 +676,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
struct ksmbd_file *parent_fp;
int new_type;
int err, lookup_flags = LOOKUP_NO_SYMLINKS;
+ int target_lookup_flags = LOOKUP_RENAME_TARGET;
if (ksmbd_override_fsids(work))
return -ENOMEM;
@@ -703,6 +687,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
goto revert_fsids;
}
+ /*
+ * explicitly handle file overwrite case, for compatibility with
+ * filesystems that may not support rename flags (e.g. fuse)
+ */
+ if (flags & RENAME_NOREPLACE)
+ target_lookup_flags |= LOOKUP_EXCL;
+ flags &= ~(RENAME_NOREPLACE);
+
retry:
err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
&new_path, &new_last, &new_type,
@@ -743,7 +735,7 @@ retry:
}
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
- lookup_flags | LOOKUP_RENAME_TARGET);
+ lookup_flags | target_lookup_flags);
if (IS_ERR(new_dentry)) {
err = PTR_ERR(new_dentry);
goto out3;
@@ -754,16 +746,6 @@ retry:
goto out4;
}
- /*
- * explicitly handle file overwrite case, for compatibility with
- * filesystems that may not support rename flags (e.g: fuse)
- */
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
- err = -EEXIST;
- goto out4;
- }
- flags &= ~(RENAME_NOREPLACE);
-
if (old_child == trap) {
err = -EINVAL;
goto out4;
diff --git a/fs/splice.c b/fs/splice.c
index 23fa5561b944..90d464241f15 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
unsigned int tail = pipe->tail;
unsigned int head = pipe->head;
- unsigned int mask = pipe->ring_size - 1;
ssize_t ret = 0;
int page_nr = 0;
@@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
}
while (!pipe_full(head, tail, pipe->max_usage)) {
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head);
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
@@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
} else if (pipe_full(head, tail, pipe->max_usage)) {
ret = -EAGAIN;
} else {
- pipe->bufs[head & mask] = *buf;
+ *pipe_buf(pipe, head) = *buf;
pipe->head = head + 1;
return buf->len;
}
@@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (unlikely(!buf->len)) {
pipe_buf_release(pipe, buf);
@@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
while (sd.total_len) {
struct kiocb kiocb;
struct iov_iter from;
- unsigned int head, tail, mask;
+ unsigned int head, tail;
size_t left;
int n;
@@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
/* build the vector */
left = sd.total_len;
for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t this_len = buf->len;
/* zero-length bvecs are not supported, skip them */
@@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* dismiss the fully eaten buffers, adjust the partial one */
tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
@@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
pipe_lock(pipe);
while (len > 0) {
- unsigned int head, tail, mask, bc = 0;
+ unsigned int head, tail, bc = 0;
size_t remain = len;
/*
@@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg;
if (!buf->len) {
@@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
len -= ret;
tail = pipe->tail;
while (ret > 0) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg = min_t(size_t, ret, buf->len);
buf->offset += seg;
@@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
int ret = 0;
bool input_wakeup = false;
@@ -1747,9 +1740,7 @@ retry:
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
size_t o_len;
@@ -1792,8 +1783,8 @@ retry:
goto retry;
}
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
if (len >= ibuf->len) {
/*
@@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
ssize_t ret = 0;
/*
@@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
if (!opipe->readers) {
@@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_full(o_head, o_tail, opipe->max_usage))
break;
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
/*
* Get a reference to this pipe buffer,
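
Throughout this file the open-coded &pipe->bufs[index & (ring_size - 1)] accesses are replaced by the pipe_buf() helper. The masking trick works only because the ring size is a power of two, so free-running head/tail indices wrap naturally. A minimal standalone model of such a helper is shown below; the struct layout and names are illustrative, not the kernel's.

#include <stdio.h>

struct buf { int len; };

struct ring {
	unsigned int head;		/* next slot to fill  */
	unsigned int tail;		/* next slot to drain */
	unsigned int size;		/* must be a power of two */
	struct buf *bufs;
};

/* Map a free-running index onto a slot, like pipe_buf(pipe, idx). */
static struct buf *ring_buf(struct ring *r, unsigned int idx)
{
	return &r->bufs[idx & (r->size - 1)];
}

int main(void)
{
	struct buf slots[8] = { { 0 } };
	struct ring r = { .head = 0, .tail = 0, .size = 8, .bufs = slots };

	/* head is never masked; it wraps naturally as an unsigned int. */
	for (unsigned int i = 0; i < 20; i++)
		ring_buf(&r, r.head++)->len = (int)i;

	printf("index 20 maps to slot %u\n", 20u & (r.size - 1));
	return 0;
}
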
diff --git a/fs/super.c b/fs/super.c
index 5a7db4a556e3..97a17f9d9023 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
- invalidate_inodes(sb);
+ evict_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
@@ -1737,61 +1737,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
-int reconfigure_single(struct super_block *s,
- int flags, void *data)
-{
- struct fs_context *fc;
- int ret;
-
- /* The caller really need to be passing fc down into mount_single(),
- * then a chunk of this can be removed. [Bollocks -- AV]
- * Better yet, reconfiguration shouldn't happen, but rather the second
- * mount should be rejected if the parameters are not compatible.
- */
- fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- ret = parse_monolithic_mount_data(fc, data);
- if (ret < 0)
- goto out;
-
- ret = reconfigure_super(fc);
-out:
- put_fs_context(fc);
- return ret;
-}
-
-static int compare_single(struct super_block *s, void *p)
-{
- return 1;
-}
-
-struct dentry *mount_single(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
-
- s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
- if (!s->s_root) {
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (!error)
- s->s_flags |= SB_ACTIVE;
- } else {
- error = reconfigure_single(s, flags, data);
- }
- if (unlikely(error)) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_single);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
deleted file mode 100644
index 67b3f90afbfd..000000000000
--- a/fs/sysv/Kconfig
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config SYSV_FS
- tristate "System V/Xenix/V7/Coherent file system support"
- depends on BLOCK
- select BUFFER_HEAD
- help
- SCO, Xenix and Coherent are commercial Unix systems for Intel
- machines, and Version 7 was used on the DEC PDP-11. Saying Y
- here would allow you to read from their floppies and hard disk
- partitions.
-
- If you have floppies or hard disk partitions like that, it is likely
- that they contain binaries from those other Unix systems; in order
- to run these binaries, you will want to install linux-abi which is
- a set of kernel modules that lets you run SCO, Xenix, Wyse,
- UnixWare, Dell Unix and System V programs under Linux. It is
- available via FTP (user: ftp) from
- <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
- NOTE: that will work only for binaries from Intel-based systems;
- PDP ones will have to wait until somebody ports Linux to -11 ;-)
-
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the System V file system support
- (but you need NFS file system support obviously).
-
- Note that this option is generally not needed for floppies, since a
- good portable way to transport files and directories between unixes
- (and even other operating systems) is given by the tar program ("man
- tar" or preferably "info tar"). Note also that this option has
- nothing whatsoever to do with the option "System V IPC". Read about
- the System V file system in
- <file:Documentation/filesystems/sysv-fs.rst>.
- Saying Y here will enlarge your kernel by about 27 KB.
-
- To compile this as a module, choose M here: the module will be called
- sysv.
-
- If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
deleted file mode 100644
index 17d12ba04b18..000000000000
--- a/fs/sysv/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the Linux SystemV/Coherent filesystem routines.
-#
-
-obj-$(CONFIG_SYSV_FS) += sysv.o
-
-sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
- namei.o super.o
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
deleted file mode 100644
index 0e69dbdf7277..000000000000
--- a/fs/sysv/balloc.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/balloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/balloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing blocks.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tfree = *sb->sv_free_blocks
- but we nevertheless keep it up to date. */
-
-static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh)
-{
- char *bh_data = bh->b_data;
-
- if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4)
- return (sysv_zone_t*)(bh_data+4);
- else
- return (sysv_zone_t*)(bh_data+2);
-}
-
-/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */
-
-void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- sysv_zone_t *blocks = sbi->s_bcache;
- unsigned count;
- unsigned block = fs32_to_cpu(sbi, nr);
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only no one
- * should call this for an AFS filesystem anyway...
- */
- if (sbi->s_type == FSTYPE_AFS)
- return;
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_free_block: trying to free block not in datazone\n");
- return;
- }
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count > sbi->s_flc_size) {
- printk("sysv_free_block: flc_count > flc_size\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- /* If the free list head in super-block is full, it is copied
- * into this block being freed, ditto if it's completely empty
- * (applies only on Coherent).
- */
- if (count == sbi->s_flc_size || count == 0) {
- block += sbi->s_block_base;
- bh = sb_getblk(sb, block);
- if (!bh) {
- printk("sysv_free_block: getblk() failed\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- memset(bh->b_data, 0, sb->s_blocksize);
- *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count);
- memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t));
- mark_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- brelse(bh);
- count = 0;
- }
- sbi->s_bcache[count++] = nr;
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- fs32_add(sbi, sbi->s_free_blocks, 1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
-}
-
-sysv_zone_t sysv_new_block(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int block;
- sysv_zone_t nr;
- struct buffer_head * bh;
- unsigned count;
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count == 0) /* Applies only to Coherent FS */
- goto Enospc;
- nr = sbi->s_bcache[--count];
- if (nr == 0) /* Applies only to Xenix FS, SystemV FS */
- goto Enospc;
-
- block = fs32_to_cpu(sbi, nr);
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_new_block: new block %d is not in data zone\n",
- block);
- goto Enospc;
- }
-
- if (count == 0) { /* the last block continues the free list */
- unsigned count;
-
- block += sbi->s_block_base;
- if (!(bh = sb_bread(sb, block))) {
- printk("sysv_new_block: cannot read free-list block\n");
- /* retry this same block next time */
- *sbi->s_bcache_count = cpu_to_fs16(sbi, 1);
- goto Enospc;
- }
- count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- if (count > sbi->s_flc_size) {
- printk("sysv_new_block: free-list block with >flc_size entries\n");
- brelse(bh);
- goto Enospc;
- }
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- memcpy(sbi->s_bcache, get_chunk(sb, bh),
- count * sizeof(sysv_zone_t));
- brelse(bh);
- }
- /* Now the free list head in the superblock is valid again. */
- fs32_add(sbi, sbi->s_free_blocks, -1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
- return nr;
-
-Enospc:
- mutex_unlock(&sbi->s_lock);
- return 0;
-}
-
-unsigned long sysv_count_free_blocks(struct super_block * sb)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- int sb_count;
- int count;
- struct buffer_head * bh = NULL;
- sysv_zone_t *blocks;
- unsigned block;
- int n;
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only we just
- * lie and say it has no free block at all.
- */
- if (sbi->s_type == FSTYPE_AFS)
- return 0;
-
- mutex_lock(&sbi->s_lock);
- sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- n = fs16_to_cpu(sbi, *sbi->s_bcache_count);
- blocks = sbi->s_bcache;
- while (1) {
- sysv_zone_t zone;
- if (n > sbi->s_flc_size)
- goto E2big;
- zone = 0;
- while (n && (zone = blocks[--n]) != 0)
- count++;
- if (zone == 0)
- break;
-
- block = fs32_to_cpu(sbi, zone);
- if (bh)
- brelse(bh);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones)
- goto Einval;
- block += sbi->s_block_base;
- bh = sb_bread(sb, block);
- if (!bh)
- goto Eio;
- n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- blocks = get_chunk(sb, bh);
- }
- if (bh)
- brelse(bh);
- if (count != sb_count)
- goto Ecount;
-done:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_blocks: new block %d is not in data zone\n",
- block);
- goto trust_sb;
-Eio:
- printk("sysv_count_free_blocks: cannot read free-list block\n");
- goto trust_sb;
-E2big:
- printk("sysv_count_free_blocks: >flc_size entries in free-list block\n");
- if (bh)
- brelse(bh);
-trust_sb:
- count = sb_count;
- goto done;
-Ecount:
- printk("sysv_count_free_blocks: free block count was %d, "
- "correcting to %d\n", sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_free_blocks = cpu_to_fs32(sbi, count);
- dirty_sb(sb);
- }
- goto done;
-}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
deleted file mode 100644
index 639307e2ff8c..000000000000
--- a/fs/sysv/dir.c
+++ /dev/null
@@ -1,378 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/dir.c
- *
- * minix/dir.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/dir.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/dir.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent directory handling functions
- */
-
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include "sysv.h"
-
-static int sysv_readdir(struct file *, struct dir_context *);
-
-const struct file_operations sysv_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = sysv_readdir,
- .fsync = generic_file_fsync,
-};
-
-static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *dir = mapping->host;
-
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- folio_unlock(folio);
-}
-
-static int sysv_handle_dirsync(struct inode *dir)
-{
- int err;
-
- err = filemap_write_and_wait(dir->i_mapping);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- return err;
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio()
- * and must be treated accordingly for nesting purposes.
- */
-static void *dir_get_folio(struct inode *dir, unsigned long n,
- struct folio **foliop)
-{
- struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
-
- if (IS_ERR(folio))
- return ERR_CAST(folio);
- *foliop = folio;
- return kmap_local_folio(folio, 0);
-}
-
-static int sysv_readdir(struct file *file, struct dir_context *ctx)
-{
- unsigned long pos = ctx->pos;
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- unsigned long npages = dir_pages(inode);
- unsigned offset;
- unsigned long n;
-
- ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
- if (pos >= inode->i_size)
- return 0;
-
- offset = pos & ~PAGE_MASK;
- n = pos >> PAGE_SHIFT;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct sysv_dir_entry *de;
- struct folio *folio;
-
- kaddr = dir_get_folio(inode, n, &folio);
- if (IS_ERR(kaddr))
- continue;
- de = (struct sysv_dir_entry *)(kaddr+offset);
- limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
- for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
- char *name = de->name;
-
- if (!de->inode)
- continue;
-
- if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
- fs16_to_cpu(SYSV_SB(sb), de->inode),
- DT_UNKNOWN)) {
- folio_release_kmap(folio, kaddr);
- return 0;
- }
- }
- folio_release_kmap(folio, kaddr);
- }
- return 0;
-}
-
-/* compare strings: name[0..len-1] (not zero-terminated) and
- * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1])
- */
-static inline int namecompare(int len, int maxlen,
- const char * name, const char * buffer)
-{
- if (len < maxlen && buffer[len])
- return 0;
- return !memcmp(name, buffer, len);
-}
-
-/*
- * sysv_find_entry()
- *
- * finds an entry in the specified directory with the wanted name.
- * It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- *
- * On Success folio_release_kmap() should be called on *foliop.
- *
- * sysv_find_entry() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop)
-{
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct inode * dir = d_inode(dentry->d_parent);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct sysv_dir_entry *de;
-
- start = SYSV_I(dir)->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
-
- do {
- char *kaddr = dir_get_folio(dir, n, foliop);
-
- if (!IS_ERR(kaddr)) {
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(*foliop) - SYSV_DIRSIZE;
- for ( ; (char *) de <= kaddr ; de++) {
- if (!de->inode)
- continue;
- if (namecompare(namelen, SYSV_NAMELEN,
- name, de->name))
- goto found;
- }
- folio_release_kmap(*foliop, kaddr);
- }
-
- if (++n >= npages)
- n = 0;
- } while (n != start);
-
- return NULL;
-
-found:
- SYSV_I(dir)->i_dir_start_lookup = n;
- return de;
-}
-
-int sysv_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct folio *folio = NULL;
- struct sysv_dir_entry * de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- /* We take care of directory expansion in the same loop */
- for (n = 0; n <= npages; n++) {
- kaddr = dir_get_folio(dir, n, &folio);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - SYSV_DIRSIZE;
- while ((char *)de <= kaddr) {
- if (!de->inode)
- goto got_it;
- err = -EEXIST;
- if (namecompare(namelen, SYSV_NAMELEN, name, de->name))
- goto out_folio;
- de++;
- }
- folio_release_kmap(folio, kaddr);
- }
- BUG();
- return -EINVAL;
-
-got_it:
- pos = folio_pos(folio) + offset_in_folio(folio, de);
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err)
- goto out_unlock;
- memcpy (de->name, name, namelen);
- memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- err = sysv_handle_dirsync(dir);
-out_folio:
- folio_release_kmap(folio, kaddr);
- return err;
-out_unlock:
- folio_unlock(folio);
- goto out_folio;
-}
-
-int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
-{
- struct inode *inode = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = 0;
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
- return sysv_handle_dirsync(inode);
-}
-
-int sysv_make_empty(struct inode *inode, struct inode *dir)
-{
- struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
- struct sysv_dir_entry * de;
- char *kaddr;
- int err;
-
- if (IS_ERR(folio))
- return PTR_ERR(folio);
- err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- goto fail;
- }
- kaddr = kmap_local_folio(folio, 0);
- memset(kaddr, 0, folio_size(folio));
-
- de = (struct sysv_dir_entry *)kaddr;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- strcpy(de->name,".");
- de++;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
- strcpy(de->name,"..");
-
- kunmap_local(kaddr);
- dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- err = sysv_handle_dirsync(inode);
-fail:
- folio_put(folio);
- return err;
-}
-
-/*
- * routine to check that the specified directory is empty (for rmdir)
- */
-int sysv_empty_dir(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct folio *folio = NULL;
- unsigned long i, npages = dir_pages(inode);
- char *kaddr;
-
- for (i = 0; i < npages; i++) {
- struct sysv_dir_entry *de;
-
- kaddr = dir_get_folio(inode, i, &folio);
- if (IS_ERR(kaddr))
- continue;
-
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(folio) - SYSV_DIRSIZE;
-
- for ( ;(char *)de <= kaddr; de++) {
- if (!de->inode)
- continue;
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (!de->name[1]) {
- if (de->inode == cpu_to_fs16(SYSV_SB(sb),
- inode->i_ino))
- continue;
- goto not_empty;
- }
- if (de->name[1] != '.' || de->name[2])
- goto not_empty;
- }
- folio_release_kmap(folio, kaddr);
- }
- return 1;
-
-not_empty:
- folio_release_kmap(folio, kaddr);
- return 0;
-}
-
-/* Releases the page */
-int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio,
- struct inode *inode)
-{
- struct inode *dir = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- return sysv_handle_dirsync(inode);
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * sysv_dotdot() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
-{
- struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop);
-
- if (IS_ERR(de))
- return NULL;
- /* ".." is the second directory entry */
- return de + 1;
-}
-
-ino_t sysv_inode_by_name(struct dentry *dentry)
-{
- struct folio *folio;
- struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio);
- ino_t res = 0;
-
- if (de) {
- res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
- folio_release_kmap(folio, de);
- }
- return res;
-}
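
The directory code deleted above operates on fixed 16-byte slots: a 16-bit inode number that is zero when the slot is unused, followed by a 14-byte name (SYSV_NAMELEN) that is NUL-padded but not necessarily NUL-terminated. That is why namecompare() only accepts a shorter lookup name when the byte right after it in the slot is NUL. A minimal sketch of the convention follows; dir_entry, name_matches and NAMELEN are illustrative stand-ins rather than the kernel's definitions.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NAMELEN 14                   /* mirrors SYSV_NAMELEN */

struct dir_entry {                   /* 16 bytes, like SYSV_DIRSIZE */
	uint16_t inode;              /* 0 means "slot unused" */
	char name[NAMELEN];          /* NUL-padded, may fill all 14 bytes */
};

/* Same convention as namecompare() in the removed dir.c. */
static int name_matches(const char *name, size_t len,
			const struct dir_entry *de)
{
	if (len > NAMELEN)
		return 0;
	if (len < NAMELEN && de->name[len] != '\0')
		return 0;                     /* slot name is longer */
	return memcmp(name, de->name, len) == 0;
}

int main(void)
{
	struct dir_entry de = { .inode = 12 };

	memcpy(de.name, "README", 6);         /* rest of name[] stays NUL */
	printf("%d %d\n", name_matches("README", 6, &de),
	       name_matches("READ", 4, &de)); /* prints: 1 0 */
	return 0;
}
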
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
deleted file mode 100644
index c645f60bdb7f..000000000000
--- a/fs/sysv/file.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/file.c
- *
- * minix/file.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/file.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/file.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent regular file handling primitives
- */
-
-#include "sysv.h"
-
-/*
- * We have mostly NULLs here: the current defaults are OK for
- * the coh filesystem.
- */
-const struct file_operations sysv_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
- .splice_read = filemap_splice_read,
-};
-
-static int sysv_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (error)
- return error;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = inode_newsize_ok(inode, attr->ia_size);
- if (error)
- return error;
- truncate_setsize(inode, attr->ia_size);
- sysv_truncate(inode);
- }
-
- setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations sysv_file_inode_operations = {
- .setattr = sysv_setattr,
- .getattr = sysv_getattr,
-};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
deleted file mode 100644
index 269df6d49815..000000000000
--- a/fs/sysv/ialloc.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/ialloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/ialloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing inodes.
- */
-
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes
- but we nevertheless keep it up to date. */
-
-/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */
-
-/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */
-static inline sysv_ino_t *
-sv_sb_fic_inode(struct super_block * sb, unsigned int i)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (sbi->s_bh1 == sbi->s_bh2)
- return &sbi->s_sb_fic_inodes[i];
- else {
- /* 512 byte Xenix FS */
- unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]);
- if (offset < 512)
- return (sysv_ino_t*)(sbi->s_sbd1 + offset);
- else
- return (sysv_ino_t*)(sbi->s_sbd2 + offset);
- }
-}
-
-struct sysv_inode *
-sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct sysv_inode *res;
- int block = sbi->s_firstinodezone + sbi->s_block_base;
-
- block += (ino-1) >> sbi->s_inodes_per_block_bits;
- *bh = sb_bread(sb, block);
- if (!*bh)
- return NULL;
- res = (struct sysv_inode *)(*bh)->b_data;
- return res + ((ino-1) & sbi->s_inodes_per_block_1);
-}
-
-static int refill_free_cache(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int i = 0, ino;
-
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) {
- *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino);
- if (i == sbi->s_fic_size)
- break;
- }
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- } else
- raw_inode++;
- }
- brelse(bh);
-out:
- return i;
-}
-
-void sysv_free_inode(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int ino;
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- unsigned count;
-
- sb = inode->i_sb;
- ino = inode->i_ino;
- if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) {
- printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n");
- return;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("sysv_free_inode: unable to read inode block on device "
- "%s\n", inode->i_sb->s_id);
- return;
- }
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count < sbi->s_fic_size) {
- *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- }
- fs16_add(sbi, sbi->s_sb_total_free_inodes, 1);
- dirty_sb(sb);
- memset(raw_inode, 0, sizeof(struct sysv_inode));
- mark_buffer_dirty(bh);
- mutex_unlock(&sbi->s_lock);
- brelse(bh);
-}
-
-struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *inode;
- sysv_ino_t ino;
- unsigned count;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE
- };
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
- count = refill_free_cache(sb);
- if (count == 0) {
- iput(inode);
- mutex_unlock(&sbi->s_lock);
- return ERR_PTR(-ENOSPC);
- }
- }
- /* Now count > 0. */
- ino = *sv_sb_fic_inode(sb,--count);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
- dirty_sb(sb);
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_ino = fs16_to_cpu(sbi, ino);
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
- SYSV_I(inode)->i_dir_start_lookup = 0;
- insert_inode_hash(inode);
- mark_inode_dirty(inode);
-
- sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
- mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
- /* That's it. */
- mutex_unlock(&sbi->s_lock);
- return inode;
-}
-
-unsigned long sysv_count_free_inodes(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int ino, count, sb_count;
-
- mutex_lock(&sbi->s_lock);
-
- sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0)
- count++;
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- } else
- raw_inode++;
- }
- brelse(bh);
- if (count != sb_count)
- goto Einval;
-out:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_inodes: "
- "free inode count was %d, correcting to %d\n",
- sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count);
- dirty_sb(sb);
- }
- goto out;
-
-Eio:
- printk("sysv_count_free_inodes: unable to read inode table\n");
-trust_sb:
- count = sb_count;
- goto out;
-}
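
refill_free_cache() and sysv_count_free_inodes() above both apply the rule stated in the file's header comment: an on-disk inode is free when i_mode and i_nlink are both zero, and up to s_fic_size free inode numbers are kept cached in the superblock, with the scan starting just past the root inode. The short self-contained sketch below reproduces that scan; FIC_SIZE, ROOT_INO, raw_inode and refill are illustrative names, not kernel identifiers.

#include <stdio.h>
#include <stdint.h>

#define FIC_SIZE 100                 /* cache capacity, like s_fic_size */
#define ROOT_INO 2                   /* stand-in for SYSV_ROOT_INO */

struct raw_inode { uint16_t mode, nlink; };  /* only the fields that matter */

/* Scan the inode table and cache the first FIC_SIZE free inode numbers. */
static unsigned refill(const struct raw_inode *tab, unsigned ninodes,
		       uint16_t cache[FIC_SIZE])
{
	unsigned i = 0, ino;

	for (ino = ROOT_INO + 1; ino <= ninodes && i < FIC_SIZE; ino++)
		if (tab[ino - 1].mode == 0 && tab[ino - 1].nlink == 0)
			cache[i++] = (uint16_t)ino;  /* free: mode==0 && nlink==0 */
	return i;
}

int main(void)
{
	struct raw_inode tab[6] = {          /* inodes 1..6 */
		{ 040755, 2 }, { 0100644, 1 }, { 0, 0 },
		{ 0, 0 }, { 0100644, 1 }, { 0, 0 },
	};
	uint16_t cache[FIC_SIZE];
	unsigned n = refill(tab, 6, cache);

	printf("%u free: %u %u %u\n", n, cache[0], cache[1], cache[2]);  /* 3 free: 3 4 6 */
	return 0;
}
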
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
deleted file mode 100644
index 76bc2d5e75a9..000000000000
--- a/fs/sysv/inode.c
+++ /dev/null
@@ -1,354 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for allocating/freeing inodes and for read/writing
- * the superblock.
- */
-
-#include <linux/highuid.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/writeback.h>
-#include <linux/namei.h>
-#include <asm/byteorder.h>
-#include "sysv.h"
-
-static int sysv_sync_fs(struct super_block *sb, int wait)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u32 time = (u32)ktime_get_real_seconds(), old_time;
-
- mutex_lock(&sbi->s_lock);
-
- /*
- * If we are going to write out the super block,
- * then attach current time stamp.
- * But if the filesystem was marked clean, keep it clean.
- */
- old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
- if (sbi->s_type == FSTYPE_SYSV4) {
- if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time))
- *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time);
- *sbi->s_sb_time = cpu_to_fs32(sbi, time);
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- mutex_unlock(&sbi->s_lock);
-
- return 0;
-}
-
-static int sysv_remount(struct super_block *sb, int *flags, char *data)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- sync_filesystem(sb);
- if (sbi->s_forced_ro)
- *flags |= SB_RDONLY;
- return 0;
-}
-
-static void sysv_put_super(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (!sb_rdonly(sb)) {
- /* XXX ext2 also updates the state here */
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- brelse(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- brelse(sbi->s_bh2);
-
- kfree(sbi);
-}
-
-static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
-
- buf->f_type = sb->s_magic;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = sbi->s_ndatazones;
- buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb);
- buf->f_files = sbi->s_ninodes;
- buf->f_ffree = sysv_count_free_inodes(sb);
- buf->f_namelen = SYSV_NAMELEN;
- buf->f_fsid = u64_to_fsid(id);
- return 0;
-}
-
-/*
- * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32
- */
-static inline void read3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = 0;
- to[2] = from[1];
- to[3] = from[2];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- to[3] = 0;
- } else {
- to[0] = 0;
- to[1] = from[0];
- to[2] = from[1];
- to[3] = from[2];
- }
-}
-
-static inline void write3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = from[2];
- to[2] = from[3];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- } else {
- to[0] = from[1];
- to[1] = from[2];
- to[2] = from[3];
- }
-}
-
-static const struct inode_operations sysv_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = sysv_getattr,
-};
-
-void sysv_set_inode(struct inode *inode, dev_t rdev)
-{
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &sysv_file_inode_operations;
- inode->i_fop = &sysv_file_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &sysv_dir_inode_operations;
- inode->i_fop = &sysv_dir_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &sysv_aops;
- } else
- init_special_inode(inode, inode->i_mode, rdev);
-}
-
-struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- struct inode *inode;
- unsigned int block;
-
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- sb->s_id, ino);
- return ERR_PTR(-EIO);
- }
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("Major problem: unable to read inode from dev %s\n",
- inode->i_sb->s_id);
- goto bad_inode;
- }
- /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
- inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
- i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid));
- i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
- set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
- inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
- inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
- inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_blocks = 0;
-
- si = SYSV_I(inode);
- for (block = 0; block < 10+1+1+1; block++)
- read3byte(sbi, &raw_inode->i_data[3*block],
- (u8 *)&si->i_data[block]);
- brelse(bh);
- si->i_dir_start_lookup = 0;
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- sysv_set_inode(inode,
- old_decode_dev(fs32_to_cpu(sbi, si->i_data[0])));
- else
- sysv_set_inode(inode, 0);
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(-EIO);
-}
-
-static int __sysv_write_inode(struct inode *inode, int wait)
-{
- struct super_block * sb = inode->i_sb;
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- unsigned int ino, block;
- int err = 0;
-
- ino = inode->i_ino;
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- inode->i_sb->s_id, ino);
- return -EIO;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("unable to read i-node block\n");
- return -EIO;
- }
-
- raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
- raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode)));
- raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
- raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
- raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
-
- si = SYSV_I(inode);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev));
- for (block = 0; block < 10+1+1+1; block++)
- write3byte(sbi, (u8 *)&si->i_data[block],
- &raw_inode->i_data[3*block]);
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk ("IO error syncing sysv inode [%s:%08x]\n",
- sb->s_id, ino);
- err = -EIO;
- }
- }
- brelse(bh);
- return err;
-}
-
-int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-}
-
-int sysv_sync_inode(struct inode *inode)
-{
- return __sysv_write_inode(inode, 1);
-}
-
-static void sysv_evict_inode(struct inode *inode)
-{
- truncate_inode_pages_final(&inode->i_data);
- if (!inode->i_nlink) {
- inode->i_size = 0;
- sysv_truncate(inode);
- }
- invalidate_inode_buffers(inode);
- clear_inode(inode);
- if (!inode->i_nlink)
- sysv_free_inode(inode);
-}
-
-static struct kmem_cache *sysv_inode_cachep;
-
-static struct inode *sysv_alloc_inode(struct super_block *sb)
-{
- struct sysv_inode_info *si;
-
- si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
- if (!si)
- return NULL;
- return &si->vfs_inode;
-}
-
-static void sysv_free_in_core_inode(struct inode *inode)
-{
- kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
-}
-
-static void init_once(void *p)
-{
- struct sysv_inode_info *si = (struct sysv_inode_info *)p;
-
- inode_init_once(&si->vfs_inode);
-}
-
-const struct super_operations sysv_sops = {
- .alloc_inode = sysv_alloc_inode,
- .free_inode = sysv_free_in_core_inode,
- .write_inode = sysv_write_inode,
- .evict_inode = sysv_evict_inode,
- .put_super = sysv_put_super,
- .sync_fs = sysv_sync_fs,
- .remount_fs = sysv_remount,
- .statfs = sysv_statfs,
-};
-
-int __init sysv_init_icache(void)
-{
- sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
- sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
- init_once);
- if (!sysv_inode_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void sysv_destroy_icache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(sysv_inode_cachep);
-}
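
The read3byte()/write3byte() helpers above pack each entry of i_data into three on-disk bytes whose order depends on the filesystem's byte sex, as the "NXI <-> N0XI ..." comment describes. The sketch below decodes such a 3-byte pointer directly to a host-order value under the layouts implied by read3byte(); decode3 and the sample byte sequences are illustrative, not kernel code.

#include <stdio.h>
#include <stdint.h>

enum bytesex { SEX_LE, SEX_BE, SEX_PDP };

/* Decode one on-disk 3-byte block pointer to a host-order value. */
static uint32_t decode3(const uint8_t p[3], enum bytesex sex)
{
	switch (sex) {
	case SEX_LE:   /* bytes X I N on disk -> N<<16 | I<<8 | X */
		return (uint32_t)p[0] | (uint32_t)p[1] << 8 | (uint32_t)p[2] << 16;
	case SEX_BE:   /* bytes N I X on disk -> N<<16 | I<<8 | X */
		return (uint32_t)p[0] << 16 | (uint32_t)p[1] << 8 | (uint32_t)p[2];
	default:       /* PDP middle-endian: bytes N X I on disk */
		return (uint32_t)p[0] << 16 | (uint32_t)p[2] << 8 | (uint32_t)p[1];
	}
}

int main(void)
{
	const uint8_t le[3]  = { 0x34, 0x12, 0x01 };  /* block 0x011234, LE  */
	const uint8_t be[3]  = { 0x01, 0x12, 0x34 };  /* block 0x011234, BE  */
	const uint8_t pdp[3] = { 0x01, 0x34, 0x12 };  /* block 0x011234, PDP */

	printf("%x %x %x\n", (unsigned)decode3(le, SEX_LE),
	       (unsigned)decode3(be, SEX_BE), (unsigned)decode3(pdp, SEX_PDP));
	return 0;
}
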
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
deleted file mode 100644
index 451e95f474fa..000000000000
--- a/fs/sysv/itree.c
+++ /dev/null
@@ -1,511 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/itree.c
- *
- * Handling of indirect blocks' trees.
- * AV, Sep--Dec 2000
- */
-
-#include <linux/buffer_head.h>
-#include <linux/mount.h>
-#include <linux/mpage.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */
-
-static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode)
-{
- mark_buffer_dirty_inode(bh, inode);
- if (IS_SYNC(inode))
- sync_dirty_buffer(bh);
-}
-
-static int block_to_path(struct inode *inode, long block, int offsets[DEPTH])
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned long indirect_blocks = sbi->s_ind_per_block,
- double_blocks = sbi->s_ind_per_block_2;
- int n = 0;
-
- if (block < 0) {
- printk("sysv_block_map: block < 0\n");
- } else if (block < DIRECT) {
- offsets[n++] = block;
- } else if ( (block -= DIRECT) < indirect_blocks) {
- offsets[n++] = DIRECT;
- offsets[n++] = block;
- } else if ((block -= indirect_blocks) < double_blocks) {
- offsets[n++] = DIRECT+1;
- offsets[n++] = block >> ptrs_bits;
- offsets[n++] = block & (indirect_blocks - 1);
- } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) {
- offsets[n++] = DIRECT+2;
- offsets[n++] = block >> (ptrs_bits * 2);
- offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1);
- offsets[n++] = block & (indirect_blocks - 1);
- } else {
- /* nothing */;
- }
- return n;
-}
-
-static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr)
-{
- return sbi->s_block_base + fs32_to_cpu(sbi, nr);
-}
-
-typedef struct {
- sysv_zone_t *p;
- sysv_zone_t key;
- struct buffer_head *bh;
-} Indirect;
-
-static DEFINE_RWLOCK(pointers_lock);
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-static inline int verify_chain(Indirect *from, Indirect *to)
-{
- while (from <= to && from->key == *from->p)
- from++;
- return (from > to);
-}
-
-static inline sysv_zone_t *block_end(struct buffer_head *bh)
-{
- return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
-}
-
-static Indirect *get_branch(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- int block = block_to_cpu(SYSV_SB(sb), p->key);
- bh = sb_bread(sb, block);
- if (!bh)
- goto failure;
- read_lock(&pointers_lock);
- if (!verify_chain(chain, p))
- goto changed;
- add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
- read_unlock(&pointers_lock);
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-changed:
- read_unlock(&pointers_lock);
- brelse(bh);
- *err = -EAGAIN;
- goto no_block;
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-static int alloc_branch(struct inode *inode,
- int num,
- int *offsets,
- Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int n = 0;
- int i;
-
- branch[0].key = sysv_new_block(inode->i_sb);
- if (branch[0].key) for (n = 1; n < num; n++) {
- struct buffer_head *bh;
- int parent;
- /* Allocate the next block */
- branch[n].key = sysv_new_block(inode->i_sb);
- if (!branch[n].key)
- break;
- /*
- * Get buffer_head for parent block, zero it out and set
- * the pointer to new one, then send parent to disk.
- */
- parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
- bh = sb_getblk(inode->i_sb, parent);
- if (!bh) {
- sysv_free_block(inode->i_sb, branch[n].key);
- break;
- }
- lock_buffer(bh);
- memset(bh->b_data, 0, blocksize);
- branch[n].bh = bh;
- branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n];
- *branch[n].p = branch[n].key;
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- dirty_indirect(bh, inode);
- }
- if (n == num)
- return 0;
-
- /* Allocation failed, free what we already allocated */
- for (i = 1; i < n; i++)
- bforget(branch[i].bh);
- for (i = 0; i < n; i++)
- sysv_free_block(inode->i_sb, branch[i].key);
- return -ENOSPC;
-}
-
-static inline int splice_branch(struct inode *inode,
- Indirect chain[],
- Indirect *where,
- int num)
-{
- int i;
-
- /* Verify that place we are splicing to is still there and vacant */
- write_lock(&pointers_lock);
- if (!verify_chain(chain, where-1) || *where->p)
- goto changed;
- *where->p = where->key;
- write_unlock(&pointers_lock);
-
- inode_set_ctime_current(inode);
-
- /* had we spliced it onto indirect block? */
- if (where->bh)
- dirty_indirect(where->bh, inode);
-
- if (IS_SYNC(inode))
- sysv_sync_inode(inode);
- else
- mark_inode_dirty(inode);
- return 0;
-
-changed:
- write_unlock(&pointers_lock);
- for (i = 1; i < num; i++)
- bforget(where[i].bh);
- for (i = 0; i < num; i++)
- sysv_free_block(inode->i_sb, where[i].key);
- return -EAGAIN;
-}
-
-static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int err = -EIO;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- struct super_block *sb = inode->i_sb;
- Indirect *partial;
- int left;
- int depth = block_to_path(inode, iblock, offsets);
-
- if (depth == 0)
- goto out;
-
-reread:
- partial = get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
-got_it:
- map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb),
- chain[depth-1].key));
- /* Clean up and exit */
- partial = chain+depth-1; /* the whole chain */
- goto cleanup;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO) {
-cleanup:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
-out:
- return err;
- }
-
- /*
- * Indirect block might be removed by truncate while we were
- * reading it. Handling of that case (forget what we've got and
- * reread) is taken out of the main path.
- */
- if (err == -EAGAIN)
- goto changed;
-
- left = (chain + depth) - partial;
- err = alloc_branch(inode, left, offsets+(partial-chain), partial);
- if (err)
- goto cleanup;
-
- if (splice_branch(inode, chain, partial, left) < 0)
- goto changed;
-
- set_buffer_new(bh_result);
- goto got_it;
-
-changed:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
- goto reread;
-}
-
-static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q)
-{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
-
-static Indirect *find_shared(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- sysv_zone_t *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = get_branch(inode, k, offsets, chain, &err);
-
- write_lock(&pointers_lock);
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p) {
- write_unlock(&pointers_lock);
- goto no_top;
- }
- for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- *p->p = 0;
- }
- write_unlock(&pointers_lock);
-
- while (partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
-}
-
-static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q)
-{
- for ( ; p < q ; p++) {
- sysv_zone_t nr = *p;
- if (nr) {
- *p = 0;
- sysv_free_block(inode->i_sb, nr);
- mark_inode_dirty(inode);
- }
- }
-}
-
-static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth)
-{
- struct buffer_head * bh;
- struct super_block *sb = inode->i_sb;
-
- if (depth--) {
- for ( ; p < q ; p++) {
- int block;
- sysv_zone_t nr = *p;
- if (!nr)
- continue;
- *p = 0;
- block = block_to_cpu(SYSV_SB(sb), nr);
- bh = sb_bread(sb, block);
- if (!bh)
- continue;
- free_branches(inode, (sysv_zone_t*)bh->b_data,
- block_end(bh), depth);
- bforget(bh);
- sysv_free_block(sb, nr);
- mark_inode_dirty(inode);
- }
- } else
- free_data(inode, p, q);
-}
-
-void sysv_truncate (struct inode * inode)
-{
- sysv_zone_t *i_data = SYSV_I(inode)->i_data;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- Indirect *partial;
- sysv_zone_t nr = 0;
- int n;
- long iblock;
- unsigned blocksize;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
-
- blocksize = inode->i_sb->s_blocksize;
- iblock = (inode->i_size + blocksize-1)
- >> inode->i_sb->s_blocksize_bits;
-
- block_truncate_page(inode->i_mapping, inode->i_size, get_block);
-
- n = block_to_path(inode, iblock, offsets);
- if (n == 0)
- return;
-
- if (n == 1) {
- free_data(inode, i_data+offsets[0], i_data + DIRECT);
- goto do_indirects;
- }
-
- partial = find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (already detached) */
- if (nr) {
- if (partial == chain)
- mark_inode_dirty(inode);
- else
- dirty_indirect(partial->bh, inode);
- free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- free_branches(inode, partial->p + 1, block_end(partial->bh),
- (chain+n-1) - partial);
- dirty_indirect(partial->bh, inode);
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees (== subtrees deeper than...) */
- while (n < DEPTH) {
- nr = i_data[DIRECT + n - 1];
- if (nr) {
- i_data[DIRECT + n - 1] = 0;
- mark_inode_dirty(inode);
- free_branches(inode, &nr, &nr+1, n);
- }
- n++;
- }
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- if (IS_SYNC(inode))
- sysv_sync_inode (inode);
- else
- mark_inode_dirty(inode);
-}
-
-static unsigned sysv_nblocks(struct super_block *s, loff_t size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(s);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned blocks, res, direct = DIRECT, i = DEPTH;
- blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits;
- res = blocks;
- while (--i && blocks > direct) {
- blocks = ((blocks - direct - 1) >> ptrs_bits) + 1;
- res += blocks;
- direct = 1;
- }
- return res;
-}
-
-int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags)
-{
- struct super_block *s = path->dentry->d_sb;
- generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
- stat);
- stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
- stat->blksize = s->s_blocksize;
- return 0;
-}
-
-static int sysv_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return mpage_writepages(mapping, wbc, get_block);
-}
-
-static int sysv_read_folio(struct file *file, struct folio *folio)
-{
- return block_read_full_folio(folio, get_block);
-}
-
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- return __block_write_begin(folio, pos, len, get_block);
-}
-
-static void sysv_write_failed(struct address_space *mapping, loff_t to)
-{
- struct inode *inode = mapping->host;
-
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- sysv_truncate(inode);
- }
-}
-
-static int sysv_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- int ret;
-
- ret = block_write_begin(mapping, pos, len, foliop, get_block);
- if (unlikely(ret))
- sysv_write_failed(mapping, pos + len);
-
- return ret;
-}
-
-static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
-{
- return generic_block_bmap(mapping,block,get_block);
-}
-
-const struct address_space_operations sysv_aops = {
- .dirty_folio = block_dirty_folio,
- .invalidate_folio = block_invalidate_folio,
- .read_folio = sysv_read_folio,
- .writepages = sysv_writepages,
- .write_begin = sysv_write_begin,
- .write_end = generic_write_end,
- .migrate_folio = buffer_migrate_folio,
- .bmap = sysv_bmap
-};
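
block_to_path() above turns a logical block number into a chain of offsets through the 10 direct slots and the single-, double- and triple-indirect trees (DIRECT = 10, DEPTH = 4). The standalone sketch below reproduces that arithmetic with divisions in place of the shift/mask forms and takes the pointers-per-indirect-block count as a parameter; it is a simplified illustration, not the kernel routine.

#include <stdio.h>

#define DIRECT 10
#define DEPTH  4

/* Map a logical block to at most DEPTH offsets; returns 0 if out of range. */
static int block_to_path(long block, long ptrs, int offsets[DEPTH])
{
	long dbl = ptrs * ptrs;
	int n = 0;

	if (block < 0)
		return 0;
	if (block < DIRECT) {                       /* direct pointer */
		offsets[n++] = (int)block;
	} else if ((block -= DIRECT) < ptrs) {      /* single indirect */
		offsets[n++] = DIRECT;
		offsets[n++] = (int)block;
	} else if ((block -= ptrs) < dbl) {         /* double indirect */
		offsets[n++] = DIRECT + 1;
		offsets[n++] = (int)(block / ptrs);
		offsets[n++] = (int)(block % ptrs);
	} else if ((block -= dbl) / dbl < ptrs) {   /* triple indirect */
		offsets[n++] = DIRECT + 2;
		offsets[n++] = (int)(block / dbl);
		offsets[n++] = (int)((block / ptrs) % ptrs);
		offsets[n++] = (int)(block % ptrs);
	}
	return n;
}

int main(void)
{
	int off[DEPTH];
	/* 5 blocks into the double-indirect range, 256 pointers per block */
	int depth = block_to_path(10 + 256 + 5, 256, off);

	printf("depth %d: %d %d %d\n", depth, off[0], off[1], off[2]);  /* 3: 11 0 5 */
	return 0;
}
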
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
deleted file mode 100644
index fb8bd8437872..000000000000
--- a/fs/sysv/namei.c
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/namei.c
- *
- * minix/namei.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/namei.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/namei.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- */
-
-#include <linux/pagemap.h>
-#include "sysv.h"
-
-static int add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = sysv_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
-{
- struct inode * inode = NULL;
- ino_t ino;
-
- if (dentry->d_name.len > SYSV_NAMELEN)
- return ERR_PTR(-ENAMETOOLONG);
- ino = sysv_inode_by_name(dentry);
- if (ino)
- inode = sysv_iget(dir->i_sb, ino);
- return d_splice_alias(inode, dentry);
-}
-
-static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
-{
- struct inode * inode;
- int err;
-
- if (!old_valid_dev(rdev))
- return -EINVAL;
-
- inode = sysv_new_inode(dir, mode);
- err = PTR_ERR(inode);
-
- if (!IS_ERR(inode)) {
- sysv_set_inode(inode, rdev);
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int sysv_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
-{
- return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
-}
-
-static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, const char *symname)
-{
- int err = -ENAMETOOLONG;
- int l = strlen(symname)+1;
- struct inode * inode;
-
- if (l > dir->i_sb->s_blocksize)
- goto out;
-
- inode = sysv_new_inode(dir, S_IFLNK|0777);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- sysv_set_inode(inode, 0);
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
-
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int sysv_link(struct dentry * old_dentry, struct inode * dir,
- struct dentry * dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode_set_ctime_current(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return add_nondir(dentry, inode);
-}
-
-static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- struct inode * inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = sysv_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- sysv_set_inode(inode, 0);
-
- inode_inc_link_count(inode);
-
- err = sysv_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = sysv_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int sysv_unlink(struct inode * dir, struct dentry * dentry)
-{
- struct inode * inode = d_inode(dentry);
- struct folio *folio;
- struct sysv_dir_entry * de;
- int err;
-
- de = sysv_find_entry(dentry, &folio);
- if (!de)
- return -ENOENT;
-
- err = sysv_delete_entry(de, folio);
- if (!err) {
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- }
- folio_release_kmap(folio, de);
- return err;
-}
-
-static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (sysv_empty_dir(inode)) {
- err = sysv_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-/*
- * Anybody can rename anything with this: the permission checks are left to the
- * higher-level routines.
- */
-static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
- struct dentry *old_dentry, struct inode *new_dir,
- struct dentry *new_dentry, unsigned int flags)
-{
- struct inode * old_inode = d_inode(old_dentry);
- struct inode * new_inode = d_inode(new_dentry);
- struct folio *dir_folio;
- struct sysv_dir_entry * dir_de = NULL;
- struct folio *old_folio;
- struct sysv_dir_entry * old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = sysv_find_entry(old_dentry, &old_folio);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = sysv_dotdot(old_inode, &dir_folio);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct folio *new_folio;
- struct sysv_dir_entry * new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !sysv_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = sysv_find_entry(new_dentry, &new_folio);
- if (!new_de)
- goto out_dir;
- err = sysv_set_link(new_de, new_folio, old_inode);
- folio_release_kmap(new_folio, new_de);
- if (err)
- goto out_dir;
- inode_set_ctime_current(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- } else {
- err = sysv_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- err = sysv_delete_entry(old_de, old_folio);
- if (err)
- goto out_dir;
-
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = sysv_set_link(dir_de, dir_folio, new_dir);
- if (!err)
- inode_dec_link_count(old_dir);
- }
-
-out_dir:
- if (dir_de)
- folio_release_kmap(dir_folio, dir_de);
-out_old:
- folio_release_kmap(old_folio, old_de);
-out:
- return err;
-}
-
-/*
- * directories can handle most operations...
- */
-const struct inode_operations sysv_dir_inode_operations = {
- .create = sysv_create,
- .lookup = sysv_lookup,
- .link = sysv_link,
- .unlink = sysv_unlink,
- .symlink = sysv_symlink,
- .mkdir = sysv_mkdir,
- .rmdir = sysv_rmdir,
- .mknod = sysv_mknod,
- .rename = sysv_rename,
- .getattr = sysv_getattr,
-};
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
deleted file mode 100644
index 5c0d07ddbda2..000000000000
--- a/fs/sysv/super.c
+++ /dev/null
@@ -1,595 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for read/parsing the superblock.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/buffer_head.h>
-#include "sysv.h"
-
-/*
- * The following functions try to recognize specific filesystems.
- *
- * We recognize:
- * - Xenix FS by its magic number.
- * - SystemV FS by its magic number.
- * - Coherent FS by its funny fname/fpack field.
- * - SCO AFS by s_nfree == 0xffff
- * - V7 FS has no distinguishing features.
- *
- * We discriminate among SystemV4 and SystemV2 FS by the assumption that
- * the time stamp is not < 01-01-1980.
- */
-
-enum {
- JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60
-};
-
-static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
- struct xenix_super_block * sbd1;
- struct xenix_super_block * sbd2;
-
- if (bh1 != bh2)
- sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data;
- else {
- /* block size = 512, so bh1 != bh2 */
- sbd1 = (struct xenix_super_block *) bh1->b_data;
- sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);
- }
-
- *max_links = XENIX_LINK_MAX;
- sbi->s_fic_size = XENIX_NICINOD;
- sbi->s_flc_size = XENIX_NICFREE;
- sbi->s_sbd1 = (char *)sbd1;
- sbi->s_sbd2 = (char *)sbd2;
- sbi->s_sb_fic_count = &sbd1->s_ninode;
- sbi->s_sb_fic_inodes = &sbd1->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd2->s_tinode;
- sbi->s_bcache_count = &sbd1->s_nfree;
- sbi->s_bcache = &sbd1->s_free[0];
- sbi->s_free_blocks = &sbd2->s_tfree;
- sbi->s_sb_time = &sbd2->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);
-}
-
-static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv4_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv4_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv2_super_block *sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv2_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct coh_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
-
- sbd = (struct coh_super_block *) bh1->b_data;
-
- *max_links = COH_LINK_MAX;
- sbi->s_fic_size = COH_NICINOD;
- sbi->s_flc_size = COH_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh2 = sbi->s_bh2;
- struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data;
-
- *max_links = V7_LINK_MAX;
- sbi->s_fic_size = V7_NICINOD;
- sbi->s_flc_size = V7_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data;
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
- switch (fs32_to_cpu(sbi, sbd->s_type)) {
- case 1:
- sbi->s_type = FSTYPE_XENIX;
- return 1;
- case 2:
- sbi->s_type = FSTYPE_XENIX;
- return 2;
- default:
- return 0;
- }
-}
-
-static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct super_block *sb = sbi->s_sb;
- /* All relevant fields are at the same offsets in R2 and R4 */
- struct sysv4_super_block * sbd;
- u32 type;
-
- sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
-
- type = fs32_to_cpu(sbi, sbd->s_type);
-
- if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) {
- sbi->s_type = FSTYPE_AFS;
- sbi->s_forced_ro = 1;
- if (!sb_rdonly(sb)) {
- printk("SysV FS: SCO EAFS on %s detected, "
- "forcing read-only mode.\n",
- sb->s_id);
- }
- return type;
- }
-
- if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) {
- /* this is likely to happen on SystemV2 FS */
- if (type > 3 || type < 1)
- return 0;
- sbi->s_type = FSTYPE_SYSV2;
- return type;
- }
- if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10))
- return 0;
-
- /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10,
- 0x20 or 0x30 indicates that symbolic links and the 14-character
- filename limit is gone. Due to lack of information about this
- feature read-only mode seems to be a reasonable approach... -KGB */
-
- if (type >= 0x10) {
- printk("SysV FS: can't handle long file names on %s, "
- "forcing read-only mode.\n", sb->s_id);
- sbi->s_forced_ro = 1;
- }
-
- sbi->s_type = FSTYPE_SYSV4;
- return type >= 0x10 ? type >> 4 : type;
-}
-
-static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct coh_super_block * sbd;
-
- sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6))
- || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6)))
- return 0;
- sbi->s_bytesex = BYTESEX_PDP;
- sbi->s_type = FSTYPE_COH;
- return 1;
-}
-
-static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- int size = detect_sysv(sbi, bh);
-
- return size>2 ? 0 : size;
-}
-
-static struct {
- int block;
- int (*test)(struct sysv_sb_info *, struct buffer_head *);
-} flavours[] = {
- {1, detect_xenix},
- {0, detect_sysv},
- {0, detect_coherent},
- {9, detect_sysv_odd},
- {15,detect_sysv_odd},
- {18,detect_sysv},
-};
-
-static char *flavour_names[] = {
- [FSTYPE_XENIX] = "Xenix",
- [FSTYPE_SYSV4] = "SystemV",
- [FSTYPE_SYSV2] = "SystemV Release 2",
- [FSTYPE_COH] = "Coherent",
- [FSTYPE_V7] = "V7",
- [FSTYPE_AFS] = "AFS",
-};
-
-static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {
- [FSTYPE_XENIX] = detected_xenix,
- [FSTYPE_SYSV4] = detected_sysv4,
- [FSTYPE_SYSV2] = detected_sysv2,
- [FSTYPE_COH] = detected_coherent,
- [FSTYPE_V7] = detected_v7,
- [FSTYPE_AFS] = detected_sysv4,
-};
-
-static int complete_read_super(struct super_block *sb, int silent, int size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *root_inode;
- char *found = flavour_names[sbi->s_type];
- u_char n_bits = size+8;
- int bsize = 1 << n_bits;
- int bsize_4 = bsize >> 2;
-
- sbi->s_firstinodezone = 2;
-
- flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
- if (sbi->s_firstdatazone < sbi->s_firstinodezone)
- return 0;
-
- sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
- sbi->s_inodes_per_block = bsize >> 6;
- sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
- sbi->s_inodes_per_block_bits = n_bits-6;
- sbi->s_ind_per_block = bsize_4;
- sbi->s_ind_per_block_2 = bsize_4*bsize_4;
- sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4));
- sbi->s_ind_per_block_bits = n_bits-2;
-
- sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone)
- << sbi->s_inodes_per_block_bits;
-
- if (!silent)
- printk("VFS: Found a %s FS (block size = %ld) on device %s\n",
- found, sb->s_blocksize, sb->s_id);
-
- sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
- /* set up enough so that it can read an inode */
- sb->s_op = &sysv_sops;
- if (sbi->s_forced_ro)
- sb->s_flags |= SB_RDONLY;
- root_inode = sysv_iget(sb, SYSV_ROOT_INO);
- if (IS_ERR(root_inode)) {
- printk("SysV FS: get root inode failed\n");
- return 0;
- }
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root) {
- printk("SysV FS: get root dentry failed\n");
- return 0;
- }
- return 1;
-}
-
-static int sysv_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct buffer_head *bh1, *bh = NULL;
- struct sysv_sb_info *sbi;
- unsigned long blocknr;
- int size = 0, i;
-
- BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
- BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
- BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
- sb_set_blocksize(sb, BLOCK_SIZE);
-
- for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) {
- brelse(bh);
- bh = sb_bread(sb, flavours[i].block);
- if (!bh)
- continue;
- size = flavours[i].test(SYSV_SB(sb), bh);
- }
-
- if (!size)
- goto Eunknown;
-
- switch (size) {
- case 1:
- blocknr = bh->b_blocknr << 1;
- brelse(bh);
- sb_set_blocksize(sb, 512);
- bh1 = sb_bread(sb, blocknr);
- bh = sb_bread(sb, blocknr + 1);
- break;
- case 2:
- bh1 = bh;
- break;
- case 3:
- blocknr = bh->b_blocknr >> 1;
- brelse(bh);
- sb_set_blocksize(sb, 2048);
- bh1 = bh = sb_bread(sb, blocknr);
- break;
- default:
- goto Ebadsize;
- }
-
- if (bh && bh1) {
- sbi->s_bh1 = bh1;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, size))
- return 0;
- }
-
- brelse(bh1);
- brelse(bh);
- sb_set_blocksize(sb, BLOCK_SIZE);
- printk("oldfs: cannot read superblock\n");
-failed:
- kfree(sbi);
- return -EINVAL;
-
-Eunknown:
- brelse(bh);
- if (!silent)
- printk("VFS: unable to find oldfs superblock on device %s\n",
- sb->s_id);
- goto failed;
-Ebadsize:
- brelse(bh);
- if (!silent)
- printk("VFS: oldfs: unsupported block size (%dKb)\n",
- 1<<(size-2));
- goto failed;
-}
-
-static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
-{
- struct v7_super_block *v7sb;
- struct sysv_inode *v7i;
- struct buffer_head *bh2;
- struct sysv_sb_info *sbi;
-
- sbi = sb->s_fs_info;
-
- /* plausibility check on superblock */
- v7sb = (struct v7_super_block *) bh->b_data;
- if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
- fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
- fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
- return 0;
-
- /* plausibility check on root inode: it is a directory,
- with a nonzero size that is a multiple of 16 */
- bh2 = sb_bread(sb, 2);
- if (bh2 == NULL)
- return 0;
-
- v7i = (struct sysv_inode *)(bh2->b_data + 64);
- if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
- (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
- (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
- (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
- sizeof(struct sysv_dir_entry))) {
- brelse(bh2);
- return 0;
- }
-
- brelse(bh2);
- return 1;
-}
-
-static int v7_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct sysv_sb_info *sbi;
- struct buffer_head *bh;
-
- BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
- BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- sbi->s_type = FSTYPE_V7;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
-
- sb_set_blocksize(sb, 512);
-
- if ((bh = sb_bread(sb, 1)) == NULL) {
- if (!silent)
- printk("VFS: unable to read V7 FS superblock on "
- "device %s.\n", sb->s_id);
- goto failed;
- }
-
- /* Try PDP-11 UNIX */
- sbi->s_bytesex = BYTESEX_PDP;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- /* Try PC/IX, v7/x86 */
- sbi->s_bytesex = BYTESEX_LE;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- goto failed;
-
-detected:
- sbi->s_bh1 = bh;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, 1))
- return 0;
-
-failed:
- printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
- sb->s_id);
- brelse(bh);
- kfree(sbi);
- return -EINVAL;
-}
-
-/* Every kernel module contains stuff like this. */
-
-static struct dentry *sysv_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
-}
-
-static struct dentry *v7_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
-}
-
-static struct file_system_type sysv_fs_type = {
- .owner = THIS_MODULE,
- .name = "sysv",
- .mount = sysv_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("sysv");
-
-static struct file_system_type v7_fs_type = {
- .owner = THIS_MODULE,
- .name = "v7",
- .mount = v7_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("v7");
-MODULE_ALIAS("v7");
-
-static int __init init_sysv_fs(void)
-{
- int error;
-
- error = sysv_init_icache();
- if (error)
- goto out;
- error = register_filesystem(&sysv_fs_type);
- if (error)
- goto destroy_icache;
- error = register_filesystem(&v7_fs_type);
- if (error)
- goto unregister;
- return 0;
-
-unregister:
- unregister_filesystem(&sysv_fs_type);
-destroy_icache:
- sysv_destroy_icache();
-out:
- return error;
-}
-
-static void __exit exit_sysv_fs(void)
-{
- unregister_filesystem(&sysv_fs_type);
- unregister_filesystem(&v7_fs_type);
- sysv_destroy_icache();
-}
-
-module_init(init_sysv_fs)
-module_exit(exit_sysv_fs)
-MODULE_DESCRIPTION("SystemV Filesystem");
-MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
deleted file mode 100644
index 0a48b2e7edb1..000000000000
--- a/fs/sysv/sysv.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SYSV_H
-#define _SYSV_H
-
-#include <linux/buffer_head.h>
-
-typedef __u16 __bitwise __fs16;
-typedef __u32 __bitwise __fs32;
-
-#include <linux/sysv_fs.h>
-
-/*
- * SystemV/V7/Coherent super-block data in memory
- *
- * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified
- * while the system is running). This is in contrast to the Minix and Berkeley
- * filesystems (where the superblock is never modified). This affects the
- * sync() operation: we must keep the superblock in a disk buffer and use this
- * one as our "working copy".
- */
-
-struct sysv_sb_info {
- struct super_block *s_sb; /* VFS superblock */
- int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */
- char s_bytesex; /* bytesex (le/be/pdp) */
- unsigned int s_inodes_per_block; /* number of inodes per block */
- unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */
- unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */
- unsigned int s_ind_per_block; /* number of indirections per block */
- unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */
- unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */
- unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */
- unsigned int s_block_base; /* physical block number of block 0 */
- unsigned short s_fic_size; /* free inode cache size, NICINOD */
- unsigned short s_flc_size; /* free block list chunk size, NICFREE */
- /* The superblock is kept in one or two disk buffers: */
- struct buffer_head *s_bh1;
- struct buffer_head *s_bh2;
- /* These are pointers into the disk buffer, to compensate for
- different superblock layout. */
- char * s_sbd1; /* entire superblock data, for part 1 */
- char * s_sbd2; /* entire superblock data, for part 2 */
- __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */
- sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */
- __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */
- __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */
- sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */
- __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */
- __fs32 *s_sb_time; /* pointer to s_sbd->s_time */
- __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */
- /* We keep those superblock entities that don't change here;
- this saves us an indirection and perhaps a conversion. */
- u32 s_firstinodezone; /* index of first inode zone */
- u32 s_firstdatazone; /* same as s_sbd->s_isize */
- u32 s_ninodes; /* total number of inodes */
- u32 s_ndatazones; /* total number of data zones */
- u32 s_nzones; /* same as s_sbd->s_fsize */
- u16 s_namelen; /* max length of dir entry */
- int s_forced_ro;
- struct mutex s_lock;
-};
-
-/*
- * SystemV/V7/Coherent FS inode data in memory
- */
-struct sysv_inode_info {
- __fs32 i_data[13];
- u32 i_dir_start_lookup;
- struct inode vfs_inode;
-};
-
-
-static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
-{
- return container_of(inode, struct sysv_inode_info, vfs_inode);
-}
-
-static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-
-/* identify the FS in memory */
-enum {
- FSTYPE_NONE = 0,
- FSTYPE_XENIX,
- FSTYPE_SYSV4,
- FSTYPE_SYSV2,
- FSTYPE_COH,
- FSTYPE_V7,
- FSTYPE_AFS,
- FSTYPE_END,
-};
-
-#define SYSV_MAGIC_BASE 0x012FF7B3
-
-#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX)
-#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4)
-#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2)
-#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH)
-
-
-/* Admissible values for i_nlink: 0.._LINK_MAX */
-enum {
- XENIX_LINK_MAX = 126, /* ?? */
- SYSV_LINK_MAX = 126, /* 127? 251? */
- V7_LINK_MAX = 126, /* ?? */
- COH_LINK_MAX = 10000,
-};
-
-
-static inline void dirty_sb(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
-}
-
-
-/* ialloc.c */
-extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned,
- struct buffer_head **);
-extern struct inode * sysv_new_inode(const struct inode *, umode_t);
-extern void sysv_free_inode(struct inode *);
-extern unsigned long sysv_count_free_inodes(struct super_block *);
-
-/* balloc.c */
-extern sysv_zone_t sysv_new_block(struct super_block *);
-extern void sysv_free_block(struct super_block *, sysv_zone_t);
-extern unsigned long sysv_count_free_blocks(struct super_block *);
-
-/* itree.c */
-void sysv_truncate(struct inode *);
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
-
-/* inode.c */
-extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
-extern int sysv_sync_inode(struct inode *);
-extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(struct mnt_idmap *, const struct path *,
- struct kstat *, u32, unsigned int);
-extern int sysv_init_icache(void);
-extern void sysv_destroy_icache(void);
-
-
-/* dir.c */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **);
-int sysv_add_link(struct dentry *, struct inode *);
-int sysv_delete_entry(struct sysv_dir_entry *, struct folio *);
-int sysv_make_empty(struct inode *, struct inode *);
-int sysv_empty_dir(struct inode *);
-int sysv_set_link(struct sysv_dir_entry *, struct folio *,
- struct inode *);
-struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **);
-ino_t sysv_inode_by_name(struct dentry *);
-
-
-extern const struct inode_operations sysv_file_inode_operations;
-extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct file_operations sysv_file_operations;
-extern const struct file_operations sysv_dir_operations;
-extern const struct address_space_operations sysv_aops;
-extern const struct super_operations sysv_sops;
-
-
-enum {
- BYTESEX_LE,
- BYTESEX_PDP,
- BYTESEX_BE,
-};
-
-static inline u32 PDP_swab(u32 x)
-{
-#ifdef __LITTLE_ENDIAN
- return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16);
-#else
-#ifdef __BIG_ENDIAN
- return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8);
-#else
-#error BYTESEX
-#endif
-#endif
-}
-
-static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return PDP_swab((__force __u32)n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return le32_to_cpu((__force __le32)n);
- else
- return be32_to_cpu((__force __be32)n);
-}
-
-static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return (__force __fs32)PDP_swab(n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return (__force __fs32)cpu_to_le32(n);
- else
- return (__force __fs32)cpu_to_be32(n);
-}
-
-static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d);
- else if (sbi->s_bytesex == BYTESEX_LE)
- le32_add_cpu((__le32 *)n, d);
- else
- be32_add_cpu((__be32 *)n, d);
- return *n;
-}
-
-static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return le16_to_cpu((__force __le16)n);
- else
- return be16_to_cpu((__force __be16)n);
-}
-
-static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return (__force __fs16)cpu_to_le16(n);
- else
- return (__force __fs16)cpu_to_be16(n);
-}
-
-static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- le16_add_cpu((__le16 *)n, d);
- else
- be16_add_cpu((__be16 *)n, d);
- return *n;
-}
-
-#endif /* _SYSV_H */
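
The byte-sex helpers removed above exist to handle the PDP-11 "middle-endian" layout, where a 32-bit word is stored as two little-endian 16-bit halves with the high half first. A minimal userspace sketch of that transform, using made-up sample values rather than anything from the sysv code:

#include <stdint.h>
#include <stdio.h>

/* Swap the 16-bit halves of a 32-bit word, as the removed PDP_swab()
 * does on a little-endian host. */
static uint32_t pdp_swab(uint32_t x)
{
	return ((x & 0xffff) << 16) | (x >> 16);
}

int main(void)
{
	/* 0xAABBCCDD stored PDP-style is the byte sequence BB AA DD CC;
	 * read back as a little-endian u32 that comes out as 0xCCDDAABB. */
	uint32_t raw_le = 0xCCDDAABBu;

	printf("%#x\n", pdp_swab(raw_le));	/* prints 0xaabbccdd */
	return 0;
}
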
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 9f7eb451a60f..c68f28d9c426 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
} else {
- hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode);
hrtimer_set_expires(&ctx->t.tmr, texp);
- ctx->t.tmr.function = timerfd_tmrproc;
}
if (texp != 0) {
@@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
else
- hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS);
ctx->moffs = ktime_mono_to_real(0);
@@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
return ufd;
}
- file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
- O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+ file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
+ O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
fd_install(ufd, file);
return ufd;
}
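
The timerfd hunks above follow the tree-wide conversion from hrtimer_init() plus a separate assignment of the callback to a single hrtimer_setup() call, and let anon_inode_getfile_fmode() apply FMODE_NOWAIT instead of patching file->f_mode afterwards. A minimal sketch of the hrtimer half of that pattern, with a hypothetical my_timer/my_timer_fn pair:

#include <linux/hrtimer.h>

static struct hrtimer my_timer;			/* hypothetical example timer */

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	/* callback body elided */
	return HRTIMER_NORESTART;
}

static void my_timer_init(void)
{
	/*
	 * Old style (two steps):
	 *	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *	my_timer.function = my_timer_fn;
	 *
	 * New style: callback, clock and mode are wired up together.
	 */
	hrtimer_setup(&my_timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}
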
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 53214499e384..cb1af30b49f5 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry)
return name;
}
-static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
- struct inode *inode, struct dentry *dentry,
- umode_t mode)
+static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap,
+ struct inode *inode, struct dentry *dentry,
+ umode_t mode)
{
struct tracefs_inode *ti;
char *name;
@@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
name = get_dname(dentry);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/*
* This is a new directory that does not take the default of
@@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
kfree(name);
- return ret;
+ return ERR_PTR(ret);
}
static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
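
tracefs here, and ubifs, udf, ufs and vboxsf below, are all adapted to the same VFS interface change: ->mkdir now returns a struct dentry pointer rather than an int, with NULL on success and ERR_PTR() on failure. A sketch of the conversion for a hypothetical "foo" filesystem (foo_new_inode() and foo_add_link() are made-up stand-ins for the per-filesystem work):

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/err.h>

static struct dentry *foo_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				struct dentry *dentry, umode_t mode)
{
	struct inode *inode;
	int err;

	inode = foo_new_inode(dir, S_IFDIR | mode);	/* hypothetical helper */
	if (IS_ERR(inode))
		return ERR_CAST(inode);		/* was: return PTR_ERR(inode); */

	err = foo_add_link(dentry, inode);	/* hypothetical helper */
	if (err) {
		iput(inode);			/* real filesystems do fuller cleanup */
		return ERR_PTR(err);		/* was: return err; */
	}

	d_instantiate_new(dentry, inode);
	return NULL;				/* was: return 0; */
}
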
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index fda82f3e16e8..3c3d3ad4fa6c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1002,8 +1002,8 @@ out_fname:
return err;
}
-static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = ubifs_budget_space(c, &req);
if (err)
- return err;
+ return ERR_PTR(err);
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
@@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
- return 0;
+ return NULL;
out_cancel:
dir->i_size -= sz_change;
@@ -1074,7 +1074,7 @@ out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
- return err;
+ return ERR_PTR(err);
}
static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01d8eb170382..a79f229df475 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
wbuf->c = c;
wbuf->next_ino = 0;
- hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- wbuf->timer.function = wbuf_timer_callback_nolock;
+ hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
return 0;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2cb49b6b0716..5f2e9a892bff 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir,
return udf_add_nondir(dentry, inode);
}
-static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct udf_fileident_iter iter;
@@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = udf_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
iinfo = UDF_I(inode);
inode->i_op = &udf_dir_inode_operations;
@@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
set_nlink(inode, 2);
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -456,7 +456,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
}
static int empty_dir(struct inode *dir)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 38a024c8cccd..5b3c85c93242 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return error;
}
-static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
- struct dentry * dentry, umode_t mode)
+static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
+ struct dentry * dentry, umode_t mode)
{
struct inode * inode;
int err;
@@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
goto out_fail;
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
out_fail:
inode_dec_link_count(inode);
@@ -202,7 +202,7 @@ out_fail:
discard_new_inode(inode);
out_dir:
inode_dec_link_count(dir);
- return err;
+ return ERR_PTR(err);
}
static int ufs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index da786a687fdc..4ad2c36550f1 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -10,6 +10,7 @@ config UNICODE
be a separate loadable module that gets requested only when a file
system actually use it.
-config UNICODE_NORMALIZATION_SELFTEST
+config UNICODE_NORMALIZATION_KUNIT_TEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE && KUNIT
+ default KUNIT_ALL_TESTS
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index e309afe2b2bb..d95be7fb9f6b 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),)
obj-y += unicode.o
endif
obj-$(CONFIG_UNICODE) += utf8data.o
-obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
+obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o
unicode-y := utf8-norm.o utf8-core.o
diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig
new file mode 100644
index 000000000000..62dd5c171f9c
--- /dev/null
+++ b/fs/unicode/tests/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_UNICODE=y
+CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c
index 5ddaf27b21a6..5063e8138aec 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/tests/utf8_kunit.c
@@ -1,34 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel module for testing utf-8 support.
+ * KUnit tests for utf-8 support.
*
* Copyright 2017 Collabora Ltd.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/printk.h>
#include <linux/unicode.h>
-#include <linux/dcache.h>
-
-#include "utf8n.h"
-
-static unsigned int failed_tests;
-static unsigned int total_tests;
-
-#define _test(cond, func, line, fmt, ...) do { \
- total_tests++; \
- if (!cond) { \
- failed_tests++; \
- pr_err("test %s:%d Failed: %s%s", \
- func, line, #cond, (fmt?":":".")); \
- if (fmt) \
- pr_err(fmt, ##__VA_ARGS__); \
- } \
- } while (0)
-#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define test(cond) _test(cond, __func__, __LINE__, "")
+#include <kunit/test.h>
+
+#include "../utf8n.h"
static const struct {
/* UTF-8 strings in this vector _must_ be NULL-terminated. */
@@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
}
-static void check_utf8_nfdi(struct unicode_map *um)
+static void check_utf8_nfdi(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
int len = strlen(nfdi_test_data[i].str);
int nlen = strlen(nfdi_test_data[i].dec);
int j = 0;
unsigned char c;
+ int ret;
+
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
+ nlen);
- test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
- test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
- nlen));
- if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdi_test_data[i].dec[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdi_test_data[i].dec[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdi_test_data[i].dec[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_nfdicf(struct unicode_map *um)
+static void check_utf8_nfdicf(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
int len = strlen(nfdicf_test_data[i].str);
int nlen = strlen(nfdicf_test_data[i].ncf);
int j = 0;
+ int ret;
unsigned char c;
- test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
- nlen));
- test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
- nlen));
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
+ nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
+ nlen);
- if (utf8cursor(&u8c, um, UTF8_NFDICF,
- nfdicf_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdicf_test_data[i].ncf[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdicf_test_data[i].ncf[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdicf_test_data[i].ncf[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_comparisons(struct unicode_map *table)
+static void check_utf8_comparisons(struct kunit *test)
{
int i;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
const struct qstr s1 = {.name = nfdi_test_data[i].str,
@@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};
- test_f(!utf8_strncmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
@@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};
- test_f(!utf8_strncasecmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncasecmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
}
-static void check_supported_versions(struct unicode_map *um)
+static void check_supported_versions(struct kunit *test)
{
+ struct unicode_map *um = test->priv;
/* Unicode 7.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
/* Unicode 9.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
/* Unicode 1x.0.0 (the latest version) should be supported. */
- test(utf8version_is_supported(um, UTF8_LATEST));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));
/* Next versions don't exist. */
- test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
}
-static int __init init_test_ucd(void)
+static struct kunit_case unicode_normalization_test_cases[] = {
+ KUNIT_CASE(check_supported_versions),
+ KUNIT_CASE(check_utf8_comparisons),
+ KUNIT_CASE(check_utf8_nfdicf),
+ KUNIT_CASE(check_utf8_nfdi),
+ {}
+};
+
+static int init_test_ucd(struct kunit *test)
{
- struct unicode_map *um;
+ struct unicode_map *um = utf8_load(UTF8_LATEST);
- failed_tests = 0;
- total_tests = 0;
+ test->priv = um;
- um = utf8_load(UTF8_LATEST);
- if (IS_ERR(um)) {
- pr_err("%s: Unable to load utf8 table.\n", __func__);
- return PTR_ERR(um);
- }
+ KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0,
+ "%s: Unable to load utf8 table.\n", __func__);
- check_supported_versions(um);
- check_utf8_nfdi(um);
- check_utf8_nfdicf(um);
- check_utf8_comparisons(um);
-
- if (!failed_tests)
- pr_info("All %u tests passed\n", total_tests);
- else
- pr_err("%u out of %u tests failed\n", failed_tests,
- total_tests);
- utf8_unload(um);
return 0;
}
-static void __exit exit_test_ucd(void)
+static void exit_test_ucd(struct kunit *test)
{
+ utf8_unload(test->priv);
}
-module_init(init_test_ucd);
-module_exit(exit_test_ucd);
+static struct kunit_suite unicode_normalization_test_suite = {
+ .name = "unicode_normalization",
+ .test_cases = unicode_normalization_test_cases,
+ .init = init_test_ucd,
+ .exit = exit_test_ucd,
+};
+
+kunit_test_suite(unicode_normalization_test_suite);
+
MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
-MODULE_DESCRIPTION("Kernel module for testing utf-8 support");
+MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
MODULE_LICENSE("GPL");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 768f8ab448b8..7b998c99c88d 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -586,7 +586,7 @@ ccc_mismatch:
}
}
-#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
+#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST)
EXPORT_SYMBOL_GPL(utf8version_is_supported);
EXPORT_SYMBOL_GPL(utf8nlen);
EXPORT_SYMBOL_GPL(utf8ncursor);
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index a859ac9b74ba..770e29ec3557 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap,
return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL);
}
-static int vboxsf_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *parent, struct dentry *dentry,
- umode_t mode)
+static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *parent, struct dentry *dentry,
+ umode_t mode)
{
- return vboxsf_dir_create(parent, dentry, mode, true, true, NULL);
+ return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL));
}
static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index e1036e535352..40569d3527a7 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -4,13 +4,9 @@ config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
select CRYPTO
select CRYPTO_HASH_INFO
- # SHA-256 is implied as it's intended to be the default hash algorithm.
+ # SHA-256 is selected as it's intended to be the default hash algorithm.
# To avoid bloat, other wanted algorithms must be selected explicitly.
- # Note that CRYPTO_SHA256 denotes the generic C implementation, but
- # some architectures provided optimized implementations of the same
- # algorithm that may be used instead. In this case, CRYPTO_SHA256 may
- # be omitted even if SHA-256 is being used.
- imply CRYPTO_SHA256
+ select CRYPTO_SHA256
help
This option enables fs-verity. fs-verity is the dm-verity
mechanism implemented at the file level. On supported
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7afa51e41427..5bf501cf8271 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
+ xfs_zones.o \
)
# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_zone_alloc.o \
+ xfs_zone_gc.o \
+ xfs_zone_info.o \
+ xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index b59cb461e096..e6ba914f6d06 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -301,7 +301,7 @@ xfs_get_aghdr_buf(
struct xfs_buf *bp;
int error;
- error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 0ef19f1469ec..63255820b58a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -171,18 +171,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
-STATIC xfs_filblks_t
+xfs_filblks_t
xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
+ struct xfs_mount *mp = ip->i_mount;
+ int maxrecs = mp->m_bmap_dmxr[0];
+ int level;
+ xfs_filblks_t rval;
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@@ -2572,146 +2570,6 @@ done:
}
/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork,
- struct xfs_iext_cursor *icur,
- xfs_bmbt_irec_t *new) /* new data to add to file extents */
-{
- struct xfs_ifork *ifp; /* inode fork pointer */
- xfs_bmbt_irec_t left; /* left neighbor extent entry */
- xfs_filblks_t newlen=0; /* new indirect size */
- xfs_filblks_t oldlen=0; /* old indirect size */
- xfs_bmbt_irec_t right; /* right neighbor extent entry */
- uint32_t state = xfs_bmap_fork_to_state(whichfork);
- xfs_filblks_t temp; /* temp for indirect calculations */
-
- ifp = xfs_ifork_ptr(ip, whichfork);
- ASSERT(isnullstartblock(new->br_startblock));
-
- /*
- * Check and set flags if this segment has a left neighbor
- */
- if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
- state |= BMAP_LEFT_VALID;
- if (isnullstartblock(left.br_startblock))
- state |= BMAP_LEFT_DELAY;
- }
-
- /*
- * Check and set flags if the current (right) segment exists.
- * If it doesn't exist, we're converting the hole at end-of-file.
- */
- if (xfs_iext_get_extent(ifp, icur, &right)) {
- state |= BMAP_RIGHT_VALID;
- if (isnullstartblock(right.br_startblock))
- state |= BMAP_RIGHT_DELAY;
- }
-
- /*
- * Set contiguity flags on the left and right neighbors.
- * Don't let extents get too large, even if the pieces are contiguous.
- */
- if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
- state |= BMAP_LEFT_CONTIG;
-
- if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
- (!(state & BMAP_LEFT_CONTIG) ||
- (left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
- state |= BMAP_RIGHT_CONTIG;
-
- /*
- * Switch out based on the contiguity flags.
- */
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with delayed allocations
- * on the left and on the right.
- * Merge all three into a single extent record.
- */
- temp = left.br_blockcount + new->br_blockcount +
- right.br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_startblock = nullstartblock(newlen);
- left.br_blockcount = temp;
-
- xfs_iext_remove(ip, icur, state);
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_LEFT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the left.
- * Merge the new allocation with the left neighbor.
- */
- temp = left.br_blockcount + new->br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_blockcount = temp;
- left.br_startblock = nullstartblock(newlen);
-
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the right.
- * Merge the new allocation with the right neighbor.
- */
- temp = new->br_blockcount + right.br_blockcount;
- oldlen = startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- right.br_startoff = new->br_startoff;
- right.br_startblock = nullstartblock(newlen);
- right.br_blockcount = temp;
- xfs_iext_update_extent(ip, state, icur, &right);
- break;
-
- case 0:
- /*
- * New allocation is not contiguous with another
- * delayed allocation.
- * Insert a new entry.
- */
- oldlen = newlen = 0;
- xfs_iext_insert(ip, icur, new, state);
- break;
- }
- if (oldlen != newlen) {
- ASSERT(oldlen > newlen);
- xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
-
- /*
- * Nothing to do for disk quota accounting here.
- */
- xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
- }
-}
-
-/*
* Convert a hole to a real allocation.
*/
STATIC int /* error */
@@ -4039,144 +3897,6 @@ xfs_bmapi_read(
return 0;
}
-/*
- * Add a delayed allocation extent to an inode. Blocks are reserved from the
- * global pool and the extent inserted into the inode in-core extent tree.
- *
- * On entry, got refers to the first extent beyond the offset of the extent to
- * allocate or eof is specified if no such extent exists. On return, got refers
- * to the extent record that was inserted to the inode fork.
- *
- * Note that the allocated extent may have been merged with contiguous extents
- * during insertion into the inode fork. Thus, got does not reflect the current
- * state of the inode fork on return. If necessary, the caller can use lastx to
- * look up the updated record in the inode fork.
- */
-int
-xfs_bmapi_reserve_delalloc(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t off,
- xfs_filblks_t len,
- xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got,
- struct xfs_iext_cursor *icur,
- int eof)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- xfs_extlen_t alen;
- xfs_extlen_t indlen;
- uint64_t fdblocks;
- int error;
- xfs_fileoff_t aoff;
- bool use_cowextszhint =
- whichfork == XFS_COW_FORK && !prealloc;
-
-retry:
- /*
- * Cap the alloc length. Keep track of prealloc so we know whether to
- * tag the inode before we return.
- */
- aoff = off;
- alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
- if (!eof)
- alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
- if (prealloc && alen >= len)
- prealloc = alen - len;
-
- /*
- * If we're targetting the COW fork but aren't creating a speculative
- * posteof preallocation, try to expand the reservation to align with
- * the COW extent size hint if there's sufficient free space.
- *
- * Unlike the data fork, the CoW cancellation functions will free all
- * the reservations at inactivation, so we don't require that every
- * delalloc reservation have a dirty pagecache.
- */
- if (use_cowextszhint) {
- struct xfs_bmbt_irec prev;
- xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
-
- if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
- prev.br_startoff = NULLFILEOFF;
-
- error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
- 1, 0, &aoff, &alen);
- ASSERT(!error);
- }
-
- /*
- * Make a transaction-less quota reservation for delayed allocation
- * blocks. This number gets adjusted later. We return if we haven't
- * allocated blocks already inside this loop.
- */
- error = xfs_quota_reserve_blkres(ip, alen);
- if (error)
- goto out;
-
- /*
- * Split changing sb for alen and indlen since they could be coming
- * from different places.
- */
- indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- fdblocks = indlen;
- if (XFS_IS_REALTIME_INODE(ip)) {
- error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
- if (error)
- goto out_unreserve_quota;
- } else {
- fdblocks += alen;
- }
-
- error = xfs_dec_fdblocks(mp, fdblocks, false);
- if (error)
- goto out_unreserve_frextents;
-
- ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip, alen, indlen);
-
- got->br_startoff = aoff;
- got->br_startblock = nullstartblock(indlen);
- got->br_blockcount = alen;
- got->br_state = XFS_EXT_NORM;
-
- xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
-
- /*
- * Tag the inode if blocks were preallocated. Note that COW fork
- * preallocation can occur at the start or end of the extent, even when
- * prealloc == 0, so we must also check the aligned offset and length.
- */
- if (whichfork == XFS_DATA_FORK && prealloc)
- xfs_inode_set_eofblocks_tag(ip);
- if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
- xfs_inode_set_cowblocks_tag(ip);
-
- return 0;
-
-out_unreserve_frextents:
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
-out_unreserve_quota:
- if (XFS_IS_QUOTA_ON(mp))
- xfs_quota_unreserve_blkres(ip, alen);
-out:
- if (error == -ENOSPC || error == -EDQUOT) {
- trace_xfs_delalloc_enospc(ip, off, len);
-
- if (prealloc || use_cowextszhint) {
- /* retry without any preallocation */
- use_cowextszhint = false;
- prealloc = 0;
- goto retry;
- }
- }
- return error;
-}
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del)
+ struct xfs_bmbt_irec *del,
+ uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (isrt)
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
- else
+ if (bflags & XFS_BMAPI_REMAP) {
+ ;
+ } else if (isrt) {
+ xfs_rtbxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5670,7 +5399,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 4b721d935994..b4d9c6e0f3f9 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del);
+ struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
@@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
-int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
- int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b1007fb661ba..9566a7623365 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -178,9 +178,10 @@ typedef struct xfs_sb {
xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
-
uint8_t sb_rgblklog; /* rt group number shift */
uint8_t sb_pad[7]; /* zeroes */
+ xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */
+ xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
__be64 sb_metadirino; /* metadata directory tree root */
__be32 sb_rgcount; /* # of realtime groups */
__be32 sb_rgextents; /* size of rtgroup in rtx */
-
__u8 sb_rgblklog; /* rt group number shift */
__u8 sb_pad[7]; /* zeroes */
+ __be64 sb_rtstart; /* start of internal RT section (FSB) */
+ __be64 sb_rtreserved; /* reserved (zoned) RT blocks */
/*
* The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
+#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */
+#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */
+
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
- XFS_SB_FEAT_INCOMPAT_METADIR)
+ XFS_SB_FEAT_INCOMPAT_METADIR | \
+ XFS_SB_FEAT_INCOMPAT_ZONED | \
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -952,7 +959,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __be32 di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ __be32 di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ __be32 di_used_blocks;
+ };
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2c3171262b44..12463ba766da 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
uint32_t checked; /* o: checked fs & rt metadata */
__u32 rgextents; /* rt extents in a realtime group */
__u32 rgcount; /* number of realtime groups */
- __u64 reserved[16]; /* reserved space */
+ __u64 rtstart; /* start of internal rt section */
+ __u64 rtreserved; /* RT (zoned) reserved blocks */
+ __u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
+#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+/*
+ * Devices supported by a single XFS file system. Reported in fsmaps fmr_device
+ * when using internal RT devices.
+ */
+enum xfs_device {
+ XFS_DEV_DATA = 1,
+ XFS_DEV_LOG = 2,
+ XFS_DEV_RT = 3,
+};
#ifndef HAVE_BBMACROS
/*
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index 242b05627c7a..4423932a2313 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -19,10 +19,23 @@ struct xfs_group {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
- /*
- * Track freed but not yet committed extents.
- */
- struct xfs_extent_busy_tree *xg_busy_extents;
+ union {
+ /*
+ * For perags and non-zoned RT groups:
+ * Track freed but not yet committed extents.
+ */
+ struct xfs_extent_busy_tree *xg_busy_extents;
+
+ /*
+ * For zoned RT groups:
+ * List of groups that need a zone reset.
+ *
+ * The zonegc code forces a log flush of the rtrmap inode before
+ * resetting the write pointer, so there is no need for
+ * individual busy extent tracking.
+ */
+ struct xfs_group *xg_next_reset;
+ };
/*
* Bitsets of per-ag metadata that have been checked and/or are sick.
@@ -107,9 +120,15 @@ xfs_gbno_to_daddr(
xfs_agblock_t gbno)
{
struct xfs_mount *mp = xg->xg_mount;
- uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
+ struct xfs_groups *g = &mp->m_groups[xg->xg_type];
+ xfs_fsblock_t fsbno;
+
+ if (g->has_daddr_gaps)
+ fsbno = xfs_gbno_to_fsb(xg, gbno);
+ else
+ fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
- return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
+ return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
static inline uint32_t
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f3a840a425f5..0c47b5c6ca7d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -364,7 +364,7 @@ xfs_ialloc_inode_init(
(j * M_IGEO(mp)->blocks_per_cluster));
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
- XBF_UNMAPPED, &fbuf);
+ 0, &fbuf);
if (error)
return error;
@@ -1927,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
- if (percpu_counter_read_positive(&mp->m_fdblocks) <
+ if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f24fa628fecf..aa13fc00afd7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -137,7 +137,7 @@ xfs_imap_to_bp(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ imap->im_len, 0, bpp, &xfs_inode_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
XFS_SICK_AG_INODES);
@@ -252,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
+ sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,18 @@ xfs_dinode_verify(
!xfs_has_rtreflink(mp))
return __this_address;
- /* COW extent size hint validation */
- fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
- mode, flags, flags2);
- if (fa)
- return fa;
+ if (xfs_has_zoned(mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
+ if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
+ return __this_address;
+ } else {
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp,
+ be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index deb0b7c00a1f..48fe49a5f050 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -322,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a472ac2e45d0..0d637c276db0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -475,7 +475,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
- uint32_t di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ uint32_t di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ uint32_t di_used_blocks;
+ };
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index 2f5f554a36d4..225923e463c4 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -21,6 +21,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
static const struct {
enum xfs_metafile_type mtype;
@@ -74,12 +77,11 @@ xfs_metafile_clear_iflag(
}
/*
- * Is the amount of space that could be allocated towards a given metadata
- * file at or beneath a certain threshold?
+ * Is the metafile reservation at or beneath a certain threshold?
*/
static inline bool
xfs_metafile_resv_can_cover(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
int64_t rhs)
{
/*
@@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover(
* global free block count. Take care of the first case to avoid
* touching the per-cpu counter.
*/
- if (ip->i_delayed_blks >= rhs)
+ if (mp->m_metafile_resv_avail >= rhs)
return true;
/*
* There aren't enough blocks left in the inode's reservation, but it
* isn't critical unless there also isn't enough free space.
*/
- return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
- rhs - ip->i_delayed_blks, 2048) >= 0;
+ return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
+ rhs - mp->m_metafile_resv_avail, 2048) >= 0;
}
/*
- * Is this metadata file critically low on blocks? For now we'll define that
- * as the number of blocks we can get our hands on being less than 10% of what
- * we reserved or less than some arbitrary number (maximum btree height).
+ * Is the metafile reservation critically low on blocks? For now we'll define
+ * that as the number of blocks we can get our hands on being less than 10% of
+ * what we reserved or less than some arbitrary number (maximum btree height).
*/
bool
xfs_metafile_resv_critical(
- struct xfs_inode *ip)
+ struct xfs_mount *mp)
{
- uint64_t asked_low_water;
+ ASSERT(xfs_has_metadir(mp));
- if (!ip)
- return false;
-
- ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_critical(ip, 0);
+ trace_xfs_metafile_resv_critical(mp, 0);
- if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+ if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
return true;
- asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
- if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
+ if (!xfs_metafile_resv_can_cover(mp,
+ div_u64(mp->m_metafile_resv_target, 10)))
return true;
- return XFS_TEST_ERROR(false, ip->i_mount,
- XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
@@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space(
struct xfs_inode *ip,
struct xfs_alloc_arg *args)
{
+ struct xfs_mount *mp = ip->i_mount;
int64_t len = args->len;
ASSERT(xfs_is_metadir_inode(ip));
ASSERT(args->resv == XFS_AG_RESV_METAFILE);
- trace_xfs_metafile_resv_alloc_space(ip, args->len);
+ trace_xfs_metafile_resv_alloc_space(mp, args->len);
/*
* Allocate the blocks from the metadata inode's block reservation
* and update the ondisk sb counter.
*/
- if (ip->i_delayed_blks > 0) {
+ mutex_lock(&mp->m_metafile_resv_lock);
+ if (mp->m_metafile_resv_avail > 0) {
int64_t from_resv;
- from_resv = min_t(int64_t, len, ip->i_delayed_blks);
- ip->i_delayed_blks -= from_resv;
+ from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
+ mp->m_metafile_resv_avail -= from_resv;
xfs_mod_delalloc(ip, 0, -from_resv);
xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
-from_resv);
@@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space(
xfs_trans_mod_sb(args->tp, field, -len);
}
+ mp->m_metafile_resv_used += args->len;
+ mutex_unlock(&mp->m_metafile_resv_lock);
+
ip->i_nblocks += args->len;
xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
}
@@ -186,26 +188,33 @@ xfs_metafile_resv_free_space(
struct xfs_trans *tp,
xfs_filblks_t len)
{
+ struct xfs_mount *mp = ip->i_mount;
int64_t to_resv;
ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_free_space(ip, len);
+
+ trace_xfs_metafile_resv_free_space(mp, len);
ip->i_nblocks -= len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ mutex_lock(&mp->m_metafile_resv_lock);
+ mp->m_metafile_resv_used -= len;
+
/*
* Add the freed blocks back into the inode's delalloc reservation
* until it reaches the maximum size. Update the ondisk fdblocks only.
*/
- to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ to_resv = mp->m_metafile_resv_target -
+ (mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
if (to_resv > 0) {
to_resv = min_t(int64_t, to_resv, len);
- ip->i_delayed_blks += to_resv;
+ mp->m_metafile_resv_avail += to_resv;
xfs_mod_delalloc(ip, 0, to_resv);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
len -= to_resv;
}
+ mutex_unlock(&mp->m_metafile_resv_lock);
/*
* Everything else goes back to the filesystem, so update the in-core
@@ -215,61 +224,99 @@ xfs_metafile_resv_free_space(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
}
-/* Release a metadata file's space reservation. */
+static void
+__xfs_metafile_resv_free(
+ struct xfs_mount *mp)
+{
+ if (mp->m_metafile_resv_avail) {
+ xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
+ xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
+ }
+ mp->m_metafile_resv_avail = 0;
+ mp->m_metafile_resv_used = 0;
+ mp->m_metafile_resv_target = 0;
+}
+
+/* Release unused metafile space reservation. */
void
xfs_metafile_resv_free(
- struct xfs_inode *ip)
+ struct xfs_mount *mp)
{
- /* Non-btree metadata inodes don't need space reservations. */
- if (!ip || !ip->i_meta_resv_asked)
+ if (!xfs_has_metadir(mp))
return;
- ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_free(ip, 0);
+ trace_xfs_metafile_resv_free(mp, 0);
- if (ip->i_delayed_blks) {
- xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
- xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
- ip->i_delayed_blks = 0;
- }
- ip->i_meta_resv_asked = 0;
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+ mutex_unlock(&mp->m_metafile_resv_lock);
}
-/* Set up a metadata file's space reservation. */
+/* Set up a metafile space reservation. */
int
xfs_metafile_resv_init(
- struct xfs_inode *ip,
- xfs_filblks_t ask)
+ struct xfs_mount *mp)
{
+ struct xfs_rtgroup *rtg = NULL;
+ xfs_filblks_t used = 0, target = 0;
xfs_filblks_t hidden_space;
- xfs_filblks_t used;
- int error;
+ xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4;
+ int error = 0;
- if (!ip || ip->i_meta_resv_asked > 0)
+ if (!xfs_has_metadir(mp))
return 0;
- ASSERT(xfs_is_metadir_inode(ip));
+ /*
+ * Free any previous reservation to have a clean slate.
+ */
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+
+ /*
+ * Currently the only btree metafiles that require reservations are the
+ * rtrmap and the rtrefcount. Anything new will have to be added here
+ * as well.
+ */
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ if (xfs_has_rtrmapbt(mp)) {
+ used += rtg_rmap(rtg)->i_nblocks;
+ target += xfs_rtrmapbt_calc_reserves(mp);
+ }
+ if (xfs_has_rtreflink(mp)) {
+ used += rtg_refcount(rtg)->i_nblocks;
+ target += xfs_rtrefcountbt_calc_reserves(mp);
+ }
+ }
+
+ if (!target)
+ goto out_unlock;
/*
- * Space taken by all other metadata btrees are accounted on-disk as
+ * Space taken by the per-AG metadata btrees is accounted on-disk as
* used space. We therefore only hide the space that is reserved but
* not used by the trees.
*/
- used = ip->i_nblocks;
- if (used > ask)
- ask = used;
- hidden_space = ask - used;
+ if (used > target)
+ target = used;
+ else if (target > dblocks_avail)
+ target = dblocks_avail;
+ hidden_space = target - used;
- error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
+ error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
- trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
- return error;
+ trace_xfs_metafile_resv_init_error(mp, 0);
+ goto out_unlock;
}
- xfs_mod_delalloc(ip, 0, hidden_space);
- ip->i_delayed_blks = hidden_space;
- ip->i_meta_resv_asked = ask;
+ xfs_mod_sb_delalloc(mp, hidden_space);
+
+ mp->m_metafile_resv_target = target;
+ mp->m_metafile_resv_used = used;
+ mp->m_metafile_resv_avail = hidden_space;
+
+ trace_xfs_metafile_resv_init(mp, target);
- trace_xfs_metafile_resv_init(ip, ask);
- return 0;
+out_unlock:
+ mutex_unlock(&mp->m_metafile_resv_lock);
+ return error;
}
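For orientation, the reworked reservation now lives in three per-mount fields guarded by m_metafile_resv_lock: m_metafile_resv_target (the size that was asked for), m_metafile_resv_used (blocks already consumed by the metadata btrees) and m_metafile_resv_avail (the remaining delalloc cushion). A minimal sketch of the 10% low-water test from xfs_metafile_resv_critical(), assuming only those fields and omitting the free-space fallback that xfs_metafile_resv_can_cover() also consults:

/*
 * Illustrative sketch only, not part of the patch: compare the remaining
 * cushion against a tenth of the reservation target.
 */
static inline bool
metafile_resv_below_low_water(struct xfs_mount *mp)
{
	uint64_t	low_water = div_u64(mp->m_metafile_resv_target, 10);

	return mp->m_metafile_resv_avail < low_water;
}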
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
index 95af4b52e5a7..ae6f9e779b98 100644
--- a/fs/xfs/libxfs/xfs_metafile.h
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
/* Space reservations for metadata inodes. */
struct xfs_alloc_arg;
-bool xfs_metafile_resv_critical(struct xfs_inode *ip);
+bool xfs_metafile_resv_critical(struct xfs_mount *mp);
void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
struct xfs_alloc_arg *args);
void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
xfs_filblks_t len);
-void xfs_metafile_resv_free(struct xfs_inode *ip);
-int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
+void xfs_metafile_resv_free(struct xfs_mount *mp);
+int xfs_metafile_resv_init(struct xfs_mount *mp);
/* Code specific to kernel/userspace; must be provided externally. */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index a85ecddaa48e..5ed44fdf7491 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
16299260424LL);
/* superblock field checks we got from xfs/122 */
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
- XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
XFS_CHECK_SB_OFFSET(sb_pad, 281);
+ XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
+ XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 770adf60dd73..5057536e586c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
xfs_extlen_t mod;
int error;
+ ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
end = min(end, rtg->rtg_extents - 1);
+ if (xfs_has_zoned(mp))
+ return -EINVAL;
+
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
@@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
+ if (xfs_has_zoned(mp))
+ return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
@@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
+ if (xfs_has_zoned(mp)) {
+ *rsumlevels = 0;
+ return 0;
+ }
+
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index d84d32f1b48f..9186c58e83d5 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -194,15 +194,17 @@ xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- /*
- * Lock both realtime free space metadata inodes for a freespace
- * update.
- */
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ /*
+ * Lock both realtime free space metadata inodes for a
+ * freespace update.
+ */
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
}
@@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ if (!xfs_has_zoned(rtg_mount(rtg)) &&
+ (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
@@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry(
/* Fill out form. */
memset(rgeo, 0, sizeof(*rgeo));
rgeo->rg_number = rtg_rgno(rtg);
- rgeo->rg_length = rtg_group(rtg)->xg_block_count;
+ rgeo->rg_length = rtg_blocks(rtg);
xfs_rtgroup_geom_health(rtg, rgeo);
return 0;
}
@@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
@@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 03f39d4e43fc..d36a6ae0abe5 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -37,15 +37,33 @@ struct xfs_rtgroup {
xfs_rtxnum_t rtg_extents;
/*
- * Cache of rt summary level per bitmap block with the invariant that
- * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
- * or 0 if rsum[i][bbno] == 0 for all i.
- *
+ * For bitmap based RT devices this points to a cache of rt summary
+ * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
+ * > the maximum i for which rsum[i][bbno] != 0, or 0 if
+ * rsum[i][bbno] == 0 for all i.
* Reads and writes are serialized by the rsumip inode lock.
+ *
+ * For zoned RT devices this points to the open zone structure for
+ * a group that is open for writers, or is NULL.
*/
- uint8_t *rtg_rsum_cache;
+ union {
+ uint8_t *rtg_rsum_cache;
+ struct xfs_open_zone *rtg_open_zone;
+ };
};
+/*
+ * For zoned RT devices this is set on groups that have no written blocks
+ * and can be picked by the allocator for opening.
+ */
+#define XFS_RTG_FREE XA_MARK_0
+
+/*
+ * For zoned RT devices this is set on groups that are fully written and that
+ * have unused blocks. Used by the garbage collection to pick targets.
+ */
+#define XFS_RTG_RECLAIMABLE XA_MARK_1
+
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
+static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_block_count;
+}
+
static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
@@ -222,10 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
- xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
- uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
- return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+
+ rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
+ }
+
+ return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
static inline xfs_rtblock_t
@@ -233,10 +260,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
- xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rfsblock_t bno;
- if (xfs_has_rtgroups(mp)) {
- struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index e4ec36943cb7..9bdc2cbfc113 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}
+
+/*
+ * Return the highest rgbno currently tracked by the rmap for this rtg.
+ */
+xfs_rgblock_t
+xfs_rtrmap_highest_rgbno(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
+ union xfs_btree_key key = {};
+ struct xfs_btree_cur *cur;
+
+ if (block->bb_numrecs == 0)
+ return NULLRGBLOCK;
+ cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_get_keys(cur, block, &key);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 9d0915089891..e328fd62a149 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
+xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);
+
#endif /* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 3dc5f5dba162..711e180f9ebb 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -185,6 +185,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
+ features |= XFS_FEAT_ZONED;
return features;
}
@@ -266,6 +268,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+ return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@@ -275,9 +280,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
- if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
- sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
- return false;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
+ if (sbp->sb_rextsize != 1)
+ return false;
+ } else {
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
+ return false;
+ }
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups(
return 0;
}
+static int
+xfs_validate_sb_zoned(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ if (sbp->sb_frextents != 0) {
+ xfs_warn(mp,
+"sb_frextents must be zero for zoned file systems.");
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
+ xfs_warn(mp,
+"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
+ sbp->sb_rtstart, sbp->sb_dblocks);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
+ xfs_warn(mp,
+"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
+ sbp->sb_rtreserved, sbp->sb_rblocks);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -523,6 +562,11 @@ xfs_validate_sb_common(
if (error)
return error;
}
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ error = xfs_validate_sb_zoned(mp, sbp);
+ if (error)
+ return error;
+ }
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@@ -835,6 +879,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
+ to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
+ } else {
+ to->sb_rtstart = 0;
+ to->sb_rtreserved = 0;
+ }
}
void
@@ -1001,6 +1053,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
+ to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
+ }
}
/*
@@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
+ rgs->start_fsb = mp->m_sb.sb_rtstart;
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
+ rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
@@ -1265,8 +1326,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks =
- percpu_counter_sum_positive(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
}
/*
@@ -1275,9 +1335,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
- percpu_counter_sum_positive(&mp->m_frextents);
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
+ }
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@@ -1510,6 +1571,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
+ if (xfs_has_zoned(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1530,6 +1593,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
+ if (xfs_has_zoned(mp)) {
+ geo->rtstart = sbp->sb_rtstart;
+ geo->rtreserved = sbp->sb_rtreserved;
+ }
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index ca2401c1facd..f6f4f2d4b5db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -233,6 +233,34 @@ enum xfs_group_type {
{ XG_TYPE_AG, "ag" }, \
{ XG_TYPE_RTG, "rtg" }
+enum xfs_free_counter {
+ /*
+ * Number of free blocks on the data device.
+ */
+ XC_FREE_BLOCKS,
+
+ /*
+ * Number of free RT extents on the RT device.
+ */
+ XC_FREE_RTEXTENTS,
+
+ /*
+ * Number of RT extents available for use.
+ *
+ * This counter only exists for zoned RT devices and indicates the number
+ * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
+ * also includes blocks that have been written previously and freed, but
+ * sit in an rtgroup that still needs a zone reset.
+ */
+ XC_FREE_RTAVAILABLE,
+ XC_FREE_NR,
+};
+
+#define XFS_FREECOUNTER_STR \
+ { XC_FREE_BLOCKS, "blocks" }, \
+ { XC_FREE_RTEXTENTS, "rtextents" }, \
+ { XC_FREE_RTAVAILABLE, "rtavailable" }
+
/*
* Type verifier functions
*/
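As a hedged illustration (not part of this hunk) of how the new indices are meant to be consumed, using the xfs_sum_freecounter() helper that appears elsewhere in this series:

/*
 * Sketch only: report both RT free-space counters on a zoned mount.
 */
static void
report_rt_free(struct xfs_mount *mp)
{
	uint64_t	free_rtx = xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
	uint64_t	writable = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);

	xfs_info(mp, "rt extents free 0x%llx, directly writable 0x%llx",
			(unsigned long long)free_rtx,
			(unsigned long long)writable);
}

On a zoned mount XC_FREE_RTAVAILABLE can be smaller than XC_FREE_RTEXTENTS because freed blocks sitting in not-yet-reset zones count as free but not as directly writable.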
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644
index 000000000000..b0791a71931c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zones.h"
+
+static bool
+xfs_zone_validate_empty(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > 0) {
+ xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = 0;
+ return true;
+}
+
+static bool
+xfs_zone_validate_wp(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+ xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
+ rtg_rgno(rtg), wp_fsb);
+ return false;
+ }
+
+ *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+ if (*write_pointer >= rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
+ rtg_rgno(rtg), *write_pointer);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+xfs_zone_validate_full(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = rtg->rtg_extents;
+ return true;
+}
+
+static bool
+xfs_zone_validate_seq(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ return xfs_zone_validate_empty(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return xfs_zone_validate_wp(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_FULL:
+ return xfs_zone_validate_full(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ default:
+ xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+static bool
+xfs_zone_validate_conv(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ return true;
+ default:
+ xfs_warn(mp,
+"conventional zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+bool
+xfs_zone_validate(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ uint32_t expected_size;
+
+ /*
+ * Check that the zone capacity matches the rtgroup size stored in the
+ * superblock. Note that all zones including the last one must have a
+ * uniform capacity.
+ */
+ if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+ xfs_warn(mp,
+"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+ g->blocks);
+ return false;
+ }
+
+ if (g->has_daddr_gaps) {
+ expected_size = 1 << g->blklog;
+ } else {
+ if (zone->len != zone->capacity) {
+ xfs_warn(mp,
+"zone %u has capacity != size ((0x%llx vs 0x%llx)",
+ rtg_rgno(rtg),
+ XFS_BB_TO_FSB(mp, zone->len),
+ XFS_BB_TO_FSB(mp, zone->capacity));
+ return false;
+ }
+ expected_size = g->blocks;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
+ xfs_warn(mp,
+"zone %u length (0x%llx) does match geometry (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+ expected_size);
+ }
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ return xfs_zone_validate_conv(zone, rtg);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ return xfs_zone_validate_seq(zone, rtg, write_pointer);
+ default:
+ xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
+ rtg_rgno(rtg), zone->type);
+ return false;
+ }
+}
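xfs_zone_validate() is presumably driven from a blkdev_report_zones() callback at mount time; a minimal sketch of such a callback, with hypothetical naming and the rtgroup passed through the opaque data pointer:

/*
 * Hypothetical report_zones_cb; the real mount-time caller is outside this
 * hunk.
 */
static int
example_validate_zone_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct xfs_rtgroup	*rtg = data;
	xfs_rgblock_t		write_pointer;

	if (!xfs_zone_validate(zone, rtg, &write_pointer))
		return -EFSCORRUPTED;
	/* a real caller would hand write_pointer to the zone allocator */
	return 0;
}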
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644
index 000000000000..c4f1367b2cca
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+struct xfs_rtgroup;
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones: one to move data into and one spare zone to make sure that we
+ * have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting for when we need to reserve the
+ * second zone, we actually reserve three as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES 3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES 1U
+#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+
+bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer);
+
+#endif /* _LIBXFS_ZONES_H */
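Reading the constants above together (illustrative only): three zones are held back for GC, the fourth reserved zone is the first of the two user-write zones, bringing the floor to five zones total, and at least two zones must be open at once, one kept for GC and one shared by regular writers. Expressed as compile-time checks:

/* Illustrative only: the minimums implied by the definitions above. */
static_assert(XFS_RESERVED_ZONES == 4, "3 GC zones + 1 zone for user writes");
static_assert(XFS_MIN_ZONES == 5, "reserved zones + 1 more zone for user data");
static_assert(XFS_MIN_OPEN_ZONES == 2, "1 open GC zone + 1 for regular writers");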
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 9f8c312dfd3c..303374df44bd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
+ if (xfs_has_zoned(mp))
+ return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 66da7d4d56ba..4f1e2574660d 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1038,8 +1038,8 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink filesystems. */
- if (!xfs_has_reflink(mp)) {
+ /* No CoW forks if the filesystem doesn't support out of place writes. */
+ if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ca23cf4db6c5..e629663e460a 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -350,7 +350,7 @@ retry:
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
- if (!xfs_has_realtime(mp))
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
+ fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +595,17 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index cda13447a373..f0d2b04644e4 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
- percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are
* subtracted from m_frextents, but not included in sb_frextents.
*/
- percpu_counter_set(&mp->m_frextents,
- fsc->frextents - fsc->frextents_delayed);
- if (!xfs_has_rtgroups(mp))
- mp->m_sb.sb_frextents = fsc->frextents;
+ if (!xfs_has_zoned(mp)) {
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ fsc->frextents - fsc->frextents_delayed);
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
+ }
return 0;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index db6edd5a5fe5..bb3f475b6353 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
+ /*
+ * The used block counter for rtrmap is checked and repaired elsewhere.
+ */
+ if (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
+ return;
+
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 13ff1c933cb8..a90a011c7e5f 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
- if (dip->di_version < 3)
+ if (dip->di_version < 3 ||
+ (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
@@ -1558,8 +1560,7 @@ xrep_dinode_core(
/* Read the inode cluster buffer. */
error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
- ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
- NULL);
+ ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
if (error)
return error;
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index ac38f5843090..1588ce971cb8 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
- free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index c287c755f2c5..3537f3cca6d5 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -167,10 +167,11 @@ xrep_orphanage_create(
* directory to control access to a file we put in here.
*/
if (d_really_is_negative(orphanage_dentry)) {
- error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
- 0750);
- if (error)
- goto out_dput_orphanage;
+ orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
+ orphanage_dentry, 0750);
+ error = PTR_ERR(orphanage_dentry);
+ if (IS_ERR(orphanage_dentry))
+ goto out_unlock_root;
}
/* Not a directory? Bail out. */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b32fb233cf84..8703897c0a9c 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;
- if (xreap_dirty(&rs))
- return xrep_defer_finish(sc);
+ if (xreap_dirty(&rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
- return 0;
+ return xrep_reset_metafile_resv(sc);
}
/*
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 3b5288d3ef4e..f8f9ed30f56b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
- int error;
+ int error = 0;
+
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@@ -1386,11 +1393,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
- struct xfs_inode *ip = sc->ip;
+ struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;
- delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
+ delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
+ mp->m_metafile_resv_target;
if (delta == 0)
return 0;
@@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;
- give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
+ give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
- xfs_mod_delalloc(ip, 0, -give_back);
- xfs_add_fdblocks(ip->i_mount, give_back);
- ip->i_delayed_blks -= give_back;
+ xfs_mod_sb_delalloc(mp, -give_back);
+ xfs_add_fdblocks(mp, give_back);
+ mp->m_metafile_resv_avail -= give_back;
}
return 0;
@@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv(
/*
* Not enough reservation; try to take some blocks from the filesystem
- * to the metadata inode. @delta is negative here, so invert the sign.
+ * to the metabtree reservation.
*/
- delta = -delta;
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ delta = -delta; /* delta is negative here, so invert the sign. */
+ error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
-"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
- ip->i_ino);
+"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;
- xfs_mod_delalloc(ip, 0, delta);
- ip->i_delayed_blks += delta;
+ xfs_mod_sb_delalloc(mp, delta);
+ mp->m_metafile_resv_avail += delta;
return 0;
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e8c776a34c1d..d5ff8609dbfb 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
+ if (xfs_has_zoned(sc->mp)) {
+ if (!xfs_zone_rgbno_is_valid(rtg,
+ xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+ xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+ return;
+ }
+
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
index 257cfb24beb4..983362447826 100644
--- a/fs/xfs/scrub/rtrefcount_repair.c
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -697,32 +697,6 @@ err_cur:
return error;
}
-/*
- * Now that we've logged the roots of the new btrees, invalidate all of the
- * old blocks and free them.
- */
-STATIC int
-xrep_rtrefc_remove_old_tree(
- struct xrep_rtrefc *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrefcountbt
- * and aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc,
- &rr->old_rtrefcountbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrefcount inode so that we
- * don't fail to expand the btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;
- /* Kill the old tree. */
- error = xrep_rtrefc_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrefcountbt
+ * and aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc,
+ &rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index f2fdd7a9fc24..fc2592c53af5 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -810,28 +810,6 @@ err_cur:
/* Reaping the old btree. */
-/* Reap the old rtrmapbt blocks. */
-STATIC int
-xrep_rtrmap_remove_old_tree(
- struct xrep_rtrmap *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrmapbt and
- * aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrmap inode so that we don't
- * fail to expand the new btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@@ -995,8 +973,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;
- /* Kill the old tree. */
- error = xrep_rtrmap_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrmapbt and
+ * aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 6fa9e3e5bab7..9908850bf76f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6d9965b546cb..26a04a783489 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -20,6 +20,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -77,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
+static void
+xfs_ioend_put_open_zones(
+ struct iomap_ioend *ioend)
+{
+ struct iomap_ioend *tmp;
+
+ /*
+ * Put the open zone for all ioends merged into this one (if any).
+ */
+ list_for_each_entry(tmp, &ioend->io_list, io_list)
+ xfs_open_zone_put(tmp->io_private);
+
+ /*
+ * The main ioend might not have an open zone if the submission failed
+ * before xfs_zone_alloc_and_submit got called.
+ */
+ if (ioend->io_private)
+ xfs_open_zone_put(ioend->io_private);
+}
+
/*
* IO write completion.
*/
@@ -86,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -115,10 +138,11 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
+ ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
- offset + size);
+ offset + size, NULL);
}
goto done;
}
@@ -126,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (is_zoned)
+ error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+ ioend->io_private, NULLFSBLOCK);
+ else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- if (!error && xfs_ioend_is_append(ioend))
+ if (!error &&
+ !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
+ xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, offset, size);
done:
+ if (is_zoned)
+ xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -176,17 +207,27 @@ xfs_end_io(
}
}
-STATIC void
+void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
+ /*
+ * For zone append writes, record the block number actually written and
+ * set the boundary flag if needed.
+ */
+ if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ xfs_mark_rtg_boundary(ioend);
+ }
+
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -396,10 +437,11 @@ allocate_blocks:
}
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
+xfs_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
int status)
{
+ struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -410,7 +452,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,10 +460,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ if (xfs_ioend_is_append(ioend) ||
+ (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ if (status)
+ return status;
+ submit_bio(&ioend->io_bio);
+ return 0;
}
/*
@@ -458,12 +504,107 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio));
+ folio_pos(folio) + folio_size(folio), NULL);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
+ .submit_ioend = xfs_submit_ioend,
+ .discard_folio = xfs_discard_folio,
+};
+
+struct xfs_zoned_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct xfs_open_zone *open_zone;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
+{
+ return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ struct inode *inode,
+ loff_t offset,
+ unsigned int len)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_filblks_t count_fsb;
+ struct xfs_bmbt_irec imap, del;
+ struct xfs_iext_cursor icur;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
+ /*
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
+ *
+ * This implies we never map outside folios that are locked or marked
+ * as under writeback, and thus there is no need check the fork sequence
+ * count here.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+ return 0;
+ }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
+
+ del = imap;
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+ XFS_BMAPI_REMAP);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+ wpc->iomap.flags = IOMAP_F_ANON_WRITE;
+
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
+}
+
+static int
+xfs_zoned_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
+ int status)
+{
+ wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (status)
+ return status;
+ xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+ return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+ .map_blocks = xfs_zoned_map_blocks,
+ .submit_ioend = xfs_zoned_submit_ioend,
.discard_folio = xfs_discard_folio,
};
@@ -472,10 +613,25 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = { };
+ struct xfs_inode *ip = XFS_I(mapping->host);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+ if (xfs_is_zoned_inode(ip)) {
+ struct xfs_zoned_writepage_ctx xc = { };
+ int error;
+
+ error = iomap_writepages(mapping, wbc, &xc.ctx,
+ &xfs_zoned_writeback_ops);
+ if (xc.open_zone)
+ xfs_open_zone_put(xc.open_zone);
+ return error;
+ } else {
+ struct xfs_writepage_ctx wpc = { };
+
+ return iomap_writepages(mapping, wbc, &wpc.ctx,
+ &xfs_writeback_ops);
+ }
}
STATIC int
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e0bd68419764..5a7a0f1a0b49 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
-int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0836fea2d6d8..06ca11731e43 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
- xfs_off_t end_byte)
+ xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ if (xfs_is_zoned_inode(ip) && ac) {
+ /*
+ * In a zoned buffered write context we need to return
+ * the punched delalloc allocations to the allocation
+ * context. This allows reusing them in the following
+ * iomap iterations.
+ */
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, XFS_BMAPI_REMAP);
+ ac->reserved_blocks += del.br_blockcount;
+ } else {
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, 0);
+ }
+
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -582,7 +598,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
- LLONG_MAX);
+ LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -825,7 +841,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@@ -880,7 +897,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = xfs_zero_range(ip, offset, len, NULL);
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
@@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -981,7 +999,7 @@ xfs_collapse_file_space(
trace_xfs_collapse_file_space(ip);
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index b29760d36e1a..c477b3361630 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
- xfs_off_t start_byte, xfs_off_t end_byte);
+ xfs_off_t start_byte, xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5d560e9073f4..8e7f1b324b3b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -55,27 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
}
-static inline int
-xfs_buf_is_vmapped(
- struct xfs_buf *bp)
-{
- /*
- * Return true if the buffer is vmapped.
- *
- * b_addr is null if the buffer is not mapped, but the code is clever
- * enough to know it doesn't have to map a single page, so the check has
- * to be both for b_addr and bp->b_page_count > 1.
- */
- return bp->b_addr && bp->b_page_count > 1;
-}
-
-static inline int
-xfs_buf_vmap_len(
- struct xfs_buf *bp)
-{
- return (bp->b_page_count * PAGE_SIZE);
-}
-
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -109,38 +88,168 @@ xfs_buf_stale(
spin_unlock(&bp->b_lock);
}
+static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
+{
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ if (bp->b_maps != &bp->__b_map)
+ kfree(bp->b_maps);
+ kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
+xfs_buf_free(
+ struct xfs_buf *bp)
+{
+ unsigned int size = BBTOB(bp->b_length);
+
+ trace_xfs_buf_free(bp, _RET_IP_);
+
+ ASSERT(list_empty(&bp->b_lru));
+
+ if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+ mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
+ if (is_vmalloc_addr(bp->b_addr))
+ vfree(bp->b_addr);
+ else if (bp->b_flags & _XBF_KMEM)
+ kfree(bp->b_addr);
+ else
+ folio_put(virt_to_folio(bp->b_addr));
+
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
+}
+
static int
-xfs_buf_get_maps(
+xfs_buf_alloc_kmem(
struct xfs_buf *bp,
- int map_count)
+ size_t size,
+ gfp_t gfp_mask)
{
- ASSERT(bp->b_maps == NULL);
- bp->b_map_count = map_count;
+ ASSERT(is_power_of_2(size));
+ ASSERT(size < PAGE_SIZE);
- if (map_count == 1) {
- bp->b_maps = &bp->__b_map;
- return 0;
- }
+ bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
+ if (!bp->b_addr)
+ return -ENOMEM;
- bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
- GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
- if (!bp->b_maps)
+ /*
+ * Slab guarantees that we get back naturally aligned allocations for
+ * power of two sizes. Keep this check as the canary in the coal mine
+ * in case anything changes in slab.
+ */
+ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
+ kfree(bp->b_addr);
+ bp->b_addr = NULL;
return -ENOMEM;
+ }
+ bp->b_flags |= _XBF_KMEM;
+ trace_xfs_buf_backing_kmem(bp, _RET_IP_);
return 0;
}
-static void
-xfs_buf_free_maps(
- struct xfs_buf *bp)
+/*
+ * Allocate backing memory for a buffer.
+ *
+ * For tmpfs-backed buffers used by in-memory btrees this directly maps the
+ * tmpfs page cache folios.
+ *
+ * For real file system buffers there are three different kinds of backing memory:
+ *
+ * The first type backs the buffer by a kmalloc allocation. This is done for
+ * less than PAGE_SIZE allocations to avoid wasting memory.
+ *
+ * The second type is a single folio buffer - this may be a high order folio or
+ * just a single page sized folio, but either way they get treated the same way
+ * by the rest of the code - the buffer memory spans a single contiguous memory
+ * region that we don't have to map and unmap to access the data directly.
+ *
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
+ */
+static int
+xfs_buf_alloc_backing_mem(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
{
- if (bp->b_maps != &bp->__b_map) {
- kfree(bp->b_maps);
- bp->b_maps = NULL;
+ size_t size = BBTOB(bp->b_length);
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
+ struct folio *folio;
+
+ if (xfs_buftarg_is_mem(bp->b_target))
+ return xmbuf_map_backing_mem(bp);
+
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ gfp_mask |= __GFP_ZERO;
+
+ if (flags & XBF_READ_AHEAD)
+ gfp_mask |= __GFP_NORETRY;
+
+ /*
+ * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
+ * is properly aligned. The slab allocator now guarantees an aligned
+ * allocation for all power of two sizes, which matches most of the
+ * smaller than PAGE_SIZE buffers used by XFS.
+ */
+ if (size < PAGE_SIZE && is_power_of_2(size))
+ return xfs_buf_alloc_kmem(bp, size, gfp_mask);
+
+ /*
+ * Don't bother with the retry loop for single PAGE allocations: vmalloc
+ * won't do any better.
+ */
+ if (size <= PAGE_SIZE)
+ gfp_mask |= __GFP_NOFAIL;
+
+ /*
+ * Optimistically attempt a single high order folio allocation for
+ * larger than PAGE_SIZE buffers.
+ *
+ * Allocating a high order folio makes the assumption that buffers are a
+ * power-of-2 size, matching the power-of-2 folio sizes available.
+ *
+ * The exception here is user xattr data buffers, which can be arbitrarily
+ * sized up to 64kB plus structure metadata; skip straight to the vmalloc
+ * path for those instead of wasting memory here.
+ */
+ if (size > PAGE_SIZE) {
+ if (!is_power_of_2(size))
+ goto fallback;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+ gfp_mask |= __GFP_NORETRY;
}
+ folio = folio_alloc(gfp_mask, get_order(size));
+ if (!folio) {
+ if (size <= PAGE_SIZE)
+ return -ENOMEM;
+ trace_xfs_buf_backing_fallback(bp, _RET_IP_);
+ goto fallback;
+ }
+ bp->b_addr = folio_address(folio);
+ trace_xfs_buf_backing_folio(bp, _RET_IP_);
+ return 0;
+
+fallback:
+ for (;;) {
+ bp->b_addr = __vmalloc(size, gfp_mask);
+ if (bp->b_addr)
+ break;
+ if (flags & XBF_READ_AHEAD)
+ return -ENOMEM;
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
+ memalloc_retry_wait(gfp_mask);
+ }
+
+ trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
+ return 0;
}
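As a freestanding sketch (editorial, not part of the patch), the size-class selection above reduces to the following; SKETCH_PAGE_SIZE stands in for the real PAGE_SIZE and the names are illustrative.

#include <stdbool.h>
#include <stddef.h>

#define SKETCH_PAGE_SIZE	4096u

enum buf_backing { BACKING_KMEM, BACKING_FOLIO, BACKING_VMALLOC };

/*
 * First-choice backing for a buffer of a given size, mirroring
 * xfs_buf_alloc_backing_mem(): small power-of-two buffers come from the
 * slab, non-power-of-two buffers larger than a page go straight to
 * vmalloc, and everything else first tries a single (possibly high order)
 * folio, with vmalloc as the fallback when that attempt fails for a
 * buffer larger than a page.
 */
static enum buf_backing pick_first_choice_backing(size_t size)
{
	bool pow2 = size && !(size & (size - 1));

	if (size < SKETCH_PAGE_SIZE && pow2)
		return BACKING_KMEM;
	if (size > SKETCH_PAGE_SIZE && !pow2)
		return BACKING_VMALLOC;
	return BACKING_FOLIO;
}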
static int
-_xfs_buf_alloc(
+xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
@@ -159,7 +268,7 @@ _xfs_buf_alloc(
* We don't want certain flags to appear in b_flags unless they are
* specifically set by later operations on the buffer.
*/
- flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+ flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
/*
* A new buffer is held and locked by the owner. This ensures that the
@@ -179,15 +288,14 @@ _xfs_buf_alloc(
bp->b_target = target;
bp->b_mount = target->bt_mount;
bp->b_flags = flags;
-
- error = xfs_buf_get_maps(bp, nmaps);
- if (error) {
- kmem_cache_free(xfs_buf_cache, bp);
- return error;
- }
-
bp->b_rhash_key = map[0].bm_bn;
bp->b_length = 0;
+ bp->b_map_count = nmaps;
+ if (nmaps == 1)
+ bp->b_maps = &bp->__b_map;
+ else
+ bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
for (i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
bp->b_maps[i].bm_len = map[i].bm_len;
@@ -200,195 +308,13 @@ _xfs_buf_alloc(
XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
- *bpp = bp;
- return 0;
-}
-
-static void
-xfs_buf_free_pages(
- struct xfs_buf *bp)
-{
- uint i;
-
- ASSERT(bp->b_flags & _XBF_PAGES);
-
- if (xfs_buf_is_vmapped(bp))
- vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
- for (i = 0; i < bp->b_page_count; i++) {
- if (bp->b_pages[i])
- __free_page(bp->b_pages[i]);
- }
- mm_account_reclaimed_pages(bp->b_page_count);
-
- if (bp->b_pages != bp->b_page_array)
- kfree(bp->b_pages);
- bp->b_pages = NULL;
- bp->b_flags &= ~_XBF_PAGES;
-}
-
-static void
-xfs_buf_free_callback(
- struct callback_head *cb)
-{
- struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
-
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_cache, bp);
-}
-
-static void
-xfs_buf_free(
- struct xfs_buf *bp)
-{
- trace_xfs_buf_free(bp, _RET_IP_);
-
- ASSERT(list_empty(&bp->b_lru));
-
- if (xfs_buftarg_is_mem(bp->b_target))
- xmbuf_unmap_page(bp);
- else if (bp->b_flags & _XBF_PAGES)
- xfs_buf_free_pages(bp);
- else if (bp->b_flags & _XBF_KMEM)
- kfree(bp->b_addr);
-
- call_rcu(&bp->b_rcu, xfs_buf_free_callback);
-}
-
-static int
-xfs_buf_alloc_kmem(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
-{
- gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
- size_t size = BBTOB(bp->b_length);
-
- /* Assure zeroed buffer for non-read cases. */
- if (!(flags & XBF_READ))
- gfp_mask |= __GFP_ZERO;
-
- bp->b_addr = kmalloc(size, gfp_mask);
- if (!bp->b_addr)
- return -ENOMEM;
-
- if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
- ((unsigned long)bp->b_addr & PAGE_MASK)) {
- /* b_addr spans two pages - use alloc_page instead */
- kfree(bp->b_addr);
- bp->b_addr = NULL;
- return -ENOMEM;
- }
- bp->b_offset = offset_in_page(bp->b_addr);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = kmem_to_page(bp->b_addr);
- bp->b_page_count = 1;
- bp->b_flags |= _XBF_KMEM;
- return 0;
-}
-
-static int
-xfs_buf_alloc_pages(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
-{
- gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
- long filled = 0;
-
- if (flags & XBF_READ_AHEAD)
- gfp_mask |= __GFP_NORETRY;
-
- /* Make sure that we have a page list */
- bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
- if (bp->b_page_count <= XB_PAGES) {
- bp->b_pages = bp->b_page_array;
- } else {
- bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
- gfp_mask);
- if (!bp->b_pages)
- return -ENOMEM;
- }
- bp->b_flags |= _XBF_PAGES;
-
- /* Assure zeroed buffer for non-read cases. */
- if (!(flags & XBF_READ))
- gfp_mask |= __GFP_ZERO;
-
- /*
- * Bulk filling of pages can take multiple calls. Not filling the entire
- * array is not an allocation failure, so don't back off if we get at
- * least one extra page.
- */
- for (;;) {
- long last = filled;
-
- filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
- bp->b_pages);
- if (filled == bp->b_page_count) {
- XFS_STATS_INC(bp->b_mount, xb_page_found);
- break;
- }
-
- if (filled != last)
- continue;
-
- if (flags & XBF_READ_AHEAD) {
- xfs_buf_free_pages(bp);
- return -ENOMEM;
- }
-
- XFS_STATS_INC(bp->b_mount, xb_page_retries);
- memalloc_retry_wait(gfp_mask);
- }
- return 0;
-}
-
-/*
- * Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
-{
- ASSERT(bp->b_flags & _XBF_PAGES);
- if (bp->b_page_count == 1) {
- /* A single page buffer is always mappable */
- bp->b_addr = page_address(bp->b_pages[0]);
- } else if (flags & XBF_UNMAPPED) {
- bp->b_addr = NULL;
- } else {
- int retried = 0;
- unsigned nofs_flag;
-
- /*
- * vm_map_ram() will allocate auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
- * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
- * from the same call site that can be run from both above and
- * below memory reclaim causes lockdep false positives. Hence we
- * always need to force this allocation to nofs context because
- * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
- * prevent false positive lockdep reports.
- *
- * XXX(dgc): I think dquot reclaim is the only place we can get
- * to this function from memory reclaim context now. If we fix
- * that like we've fixed inode reclaim to avoid writeback from
- * reclaim, this nofs wrapping can go away.
- */
- nofs_flag = memalloc_nofs_save();
- do {
- bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1);
- if (bp->b_addr)
- break;
- vm_unmap_aliases();
- } while (retried++ <= 1);
- memalloc_nofs_restore(nofs_flag);
-
- if (!bp->b_addr)
- return -ENOMEM;
+ error = xfs_buf_alloc_backing_mem(bp, flags);
+ if (error) {
+ xfs_buf_free(bp);
+ return error;
}
+ *bpp = bp;
return 0;
}
@@ -507,7 +433,7 @@ xfs_buf_find_lock(
return -ENOENT;
}
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_flags &= _XBF_KMEM;
bp->b_ops = NULL;
}
return 0;
@@ -575,25 +501,10 @@ xfs_buf_find_insert(
struct xfs_buf *bp;
int error;
- error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
+ error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
goto out_drop_pag;
- if (xfs_buftarg_is_mem(new_bp->b_target)) {
- error = xmbuf_map_page(new_bp);
- } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
- xfs_buf_alloc_kmem(new_bp, flags) < 0) {
- /*
- * For buffers that fit entirely within a single page, first
- * attempt to allocate the memory from the heap to minimise
- * memory usage. If we can't get heap memory for these small
- * buffers, we fall back to using the page allocator.
- */
- error = xfs_buf_alloc_pages(new_bp, flags);
- }
- if (error)
- goto out_free_buf;
-
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
@@ -704,18 +615,6 @@ xfs_buf_get_map(
xfs_perag_put(pag);
}
- /* We do not hold a perag reference anymore. */
- if (!bp->b_addr) {
- error = _xfs_buf_map_pages(bp, flags);
- if (unlikely(error)) {
- xfs_warn_ratelimited(btp->bt_mount,
- "%s: failed to map %u pages", __func__,
- bp->b_page_count);
- xfs_buf_relse(bp);
- return error;
- }
- }
-
/*
* Clear b_error if this is a lookup from a caller that doesn't expect
* valid data to be found in the buffer.
@@ -903,7 +802,6 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -912,7 +810,7 @@ xfs_buf_read_uncached(
*bpp = NULL;
- error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+ error = xfs_buf_get_uncached(target, numblks, &bp);
if (error)
return error;
@@ -938,42 +836,14 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
- struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- /* there are currently no valid flags for xfs_buf_get_uncached */
- ASSERT(flags == 0);
-
- *bpp = NULL;
-
- error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
- if (error)
- return error;
-
- if (xfs_buftarg_is_mem(bp->b_target))
- error = xmbuf_map_page(bp);
- else
- error = xfs_buf_alloc_pages(bp, flags);
- if (error)
- goto fail_free_buf;
-
- error = _xfs_buf_map_pages(bp, 0);
- if (unlikely(error)) {
- xfs_warn(target->bt_mount,
- "%s: failed to map pages", __func__);
- goto fail_free_buf;
- }
-
- trace_xfs_buf_get_uncached(bp, _RET_IP_);
- *bpp = bp;
- return 0;
-
-fail_free_buf:
- xfs_buf_free(bp);
+ error = xfs_buf_alloc(target, &map, 1, 0, bpp);
+ if (!error)
+ trace_xfs_buf_get_uncached(*bpp, _RET_IP_);
return error;
}
@@ -1299,9 +1169,9 @@ __xfs_buf_ioend(
trace_xfs_buf_iodone(bp, _RET_IP_);
if (bp->b_flags & XBF_READ) {
- if (!bp->b_error && xfs_buf_is_vmapped(bp))
+ if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
invalidate_kernel_vmap_range(bp->b_addr,
- xfs_buf_vmap_len(bp));
+ roundup(BBTOB(bp->b_length), PAGE_SIZE));
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
@@ -1462,29 +1332,48 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
- unsigned int size = BBTOB(bp->b_length);
- unsigned int map = 0, p;
+ unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
- bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
- xfs_buf_bio_op(bp), GFP_NOIO);
- bio->bi_private = bp;
- bio->bi_end_io = xfs_buf_bio_end_io;
+ if (is_vmalloc_addr(bp->b_addr)) {
+ unsigned int size = BBTOB(bp->b_length);
+ unsigned int alloc_size = roundup(size, PAGE_SIZE);
+ void *data = bp->b_addr;
- if (bp->b_flags & _XBF_KMEM) {
- __bio_add_page(bio, virt_to_page(bp->b_addr), size,
- bp->b_offset);
- } else {
- for (p = 0; p < bp->b_page_count; p++)
- __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
- bio->bi_iter.bi_size = size; /* limit to the actual size used */
+ bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
+ xfs_buf_bio_op(bp), GFP_NOIO);
+
+ do {
+ unsigned int len = min(size, PAGE_SIZE);
+
+ ASSERT(offset_in_page(data) == 0);
+ __bio_add_page(bio, vmalloc_to_page(data), len, 0);
+ data += len;
+ size -= len;
+ } while (size);
- if (xfs_buf_is_vmapped(bp))
- flush_kernel_vmap_range(bp->b_addr,
- xfs_buf_vmap_len(bp));
+ flush_kernel_vmap_range(bp->b_addr, alloc_size);
+ } else {
+ /*
+ * Single folio or slab allocation. Must be contiguous and thus
+ * only a single bvec is needed.
+ *
+ * This uses the page based bio add helper for now as that is
+ * the lowest common denominator between folios and slab
+ * allocations. To be replaced with a better block layer
+ * helper soon (hopefully).
+ */
+ bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+ GFP_NOIO);
+ __bio_add_page(bio, virt_to_page(bp->b_addr),
+ BBTOB(bp->b_length),
+ offset_in_page(bp->b_addr));
}
+ bio->bi_private = bp;
+ bio->bi_end_io = xfs_buf_bio_end_io;
+
/*
* If there is more than one map segment, split out a new bio for each
* map except of the last one. The last map is handled by the
@@ -1611,47 +1500,6 @@ xfs_buf_submit(
xfs_buf_submit_bio(bp);
}
-void *
-xfs_buf_offset(
- struct xfs_buf *bp,
- size_t offset)
-{
- struct page *page;
-
- if (bp->b_addr)
- return bp->b_addr + offset;
-
- page = bp->b_pages[offset >> PAGE_SHIFT];
- return page_address(page) + (offset & (PAGE_SIZE-1));
-}
-
-void
-xfs_buf_zero(
- struct xfs_buf *bp,
- size_t boff,
- size_t bsize)
-{
- size_t bend;
-
- bend = boff + bsize;
- while (boff < bend) {
- struct page *page;
- int page_index, page_offset, csize;
-
- page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
- page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
- page = bp->b_pages[page_index];
- csize = min_t(size_t, PAGE_SIZE - page_offset,
- BBTOB(bp->b_length) - boff);
-
- ASSERT((csize + page_offset) <= PAGE_SIZE);
-
- memset(page_address(page) + page_offset, 0, csize);
-
- boff += csize;
- }
-}
-
/*
* Log a message about and stale a buffer that a caller has decided is corrupt.
*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 80e06eecaf56..d0b065a9a9f0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -36,7 +36,6 @@ struct xfs_buf;
#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1u << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
@@ -48,7 +47,6 @@ struct xfs_buf;
#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -62,14 +60,12 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
- { _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
{ XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, \
- { XBF_UNMAPPED, "UNMAPPED" }
+ { XBF_TRYLOCK, "TRYLOCK" }
/*
* Internal state flags.
@@ -124,8 +120,6 @@ struct xfs_buftarg {
struct xfs_buf_cache bt_cache[];
};
-#define XB_PAGES 2
-
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
@@ -187,15 +181,10 @@ struct xfs_buf {
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
- struct page **b_pages; /* array of page pointers */
- struct page *b_page_array[XB_PAGES]; /* inline pages */
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
atomic_t b_pin_count; /* pin count */
- unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset of b_addr,
- only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
void (*b_iodone)(struct xfs_buf *bp);
@@ -284,9 +273,9 @@ xfs_buf_readahead(
}
int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
- xfs_buf_flags_t flags, struct xfs_buf **bpp);
+ struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ size_t numblks, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp);
void xfs_buf_hold(struct xfs_buf *bp);
@@ -315,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
-void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
-extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset)
+{
+ return bp->b_addr + offset;
+}
+
+static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize)
+{
+ memset(bp->b_addr + boff, 0, bsize);
+}
+
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 47549cfa61cd..19eb0b7a3e58 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -57,24 +57,6 @@ xfs_buf_log_format_size(
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
-static inline bool
-xfs_buf_item_straddle(
- struct xfs_buf *bp,
- uint offset,
- int first_bit,
- int nbits)
-{
- void *first, *last;
-
- first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
- last = xfs_buf_offset(bp,
- offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
-
- if (last - first != nbits * XFS_BLF_CHUNK)
- return true;
- return false;
-}
-
/*
* Return the number of log iovecs and space needed to log the given buf log
* item segment.
@@ -91,11 +73,8 @@ xfs_buf_item_size_segment(
int *nvecs,
int *nbytes)
{
- struct xfs_buf *bp = bip->bli_buf;
int first_bit;
int nbits;
- int next_bit;
- int last_bit;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (first_bit == -1)
@@ -108,15 +87,6 @@ xfs_buf_item_size_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
-
- /*
- * Straddling a page is rare because we don't log contiguous
- * chunks of unmapped buffers anywhere.
- */
- if (nbits > 1 &&
- xfs_buf_item_straddle(bp, offset, first_bit, nbits))
- goto slow_scan;
-
(*nvecs)++;
*nbytes += nbits * XFS_BLF_CHUNK;
@@ -131,40 +101,6 @@ xfs_buf_item_size_segment(
} while (first_bit != -1);
return;
-
-slow_scan:
- /* Count the first bit we jumped out of the above loop from */
- (*nvecs)++;
- *nbytes += XFS_BLF_CHUNK;
- last_bit = first_bit;
- while (last_bit != -1) {
- /*
- * This takes the bit number to start looking from and
- * returns the next set bit from there. It returns -1
- * if there are no more bits set or the start bit is
- * beyond the end of the bitmap.
- */
- next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
- last_bit + 1);
- /*
- * If we run out of bits, leave the loop,
- * else if we find a new set of bits bump the number of vecs,
- * else keep scanning the current set of bits.
- */
- if (next_bit == -1) {
- break;
- } else if (next_bit != last_bit + 1 ||
- xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
- last_bit = next_bit;
- first_bit = next_bit;
- (*nvecs)++;
- nbits = 1;
- } else {
- last_bit++;
- nbits++;
- }
- *nbytes += XFS_BLF_CHUNK;
- }
}
/*
@@ -277,8 +213,6 @@ xfs_buf_item_format_segment(
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
int first_bit;
- int last_bit;
- int next_bit;
uint nbits;
/* copy the flags across from the base format item */
@@ -323,15 +257,6 @@ xfs_buf_item_format_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
-
- /*
- * Straddling a page is rare because we don't log contiguous
- * chunks of unmapped buffers anywhere.
- */
- if (nbits > 1 &&
- xfs_buf_item_straddle(bp, offset, first_bit, nbits))
- goto slow_scan;
-
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
@@ -347,45 +272,6 @@ xfs_buf_item_format_segment(
} while (first_bit != -1);
return;
-
-slow_scan:
- ASSERT(bp->b_addr == NULL);
- last_bit = first_bit;
- nbits = 1;
- for (;;) {
- /*
- * This takes the bit number to start looking from and
- * returns the next set bit from there. It returns -1
- * if there are no more bits set or the start bit is
- * beyond the end of the bitmap.
- */
- next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
- (uint)last_bit + 1);
- /*
- * If we run out of bits fill in the last iovec and get out of
- * the loop. Else if we start a new set of bits then fill in
- * the iovec for the series we were looking at and start
- * counting the bits in the new one. Else we're still in the
- * same set of bits so just keep counting and scanning.
- */
- if (next_bit == -1) {
- xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
- first_bit, nbits);
- blfp->blf_size++;
- break;
- } else if (next_bit != last_bit + 1 ||
- xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
- xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
- first_bit, nbits);
- blfp->blf_size++;
- first_bit = next_bit;
- last_bit = next_bit;
- nbits = 1;
- } else {
- last_bit++;
- nbits++;
- }
- }
}
/*
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 05a2f6927c12..d4c5cef5bc43 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2(
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
- uint buf_flags;
xfs_lsn_t lsn;
/*
@@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2(
}
trace_xfs_log_recover_buf_recover(log, buf_f);
-
- buf_flags = 0;
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
- buf_flags |= XBF_UNMAPPED;
-
error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags, &bp, NULL);
+ 0, &bp, NULL);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 5b64a2b3b113..b4ffd80b7cb6 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -74,7 +74,7 @@ xmbuf_alloc(
/*
* We don't want to bother with kmapping data during repair, so don't
- * allow highmem pages to back this mapping.
+ * allow highmem folios to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
@@ -127,14 +127,13 @@ xmbuf_free(
kfree(btp);
}
-/* Directly map a shmem page into the buffer cache. */
+/* Directly map a shmem folio into the buffer cache. */
int
-xmbuf_map_page(
+xmbuf_map_backing_mem(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
- struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
@@ -159,39 +158,17 @@ xmbuf_map_page(
return -EIO;
}
- page = folio_file_page(folio, pos >> PAGE_SHIFT);
-
/*
- * Mark the page dirty so that it won't be reclaimed once we drop the
- * (potentially last) reference in xmbuf_unmap_page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfs_buf_free.
*/
- set_page_dirty(page);
- unlock_page(page);
+ folio_set_dirty(folio);
+ folio_unlock(folio);
- bp->b_addr = page_address(page);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = page;
- bp->b_page_count = 1;
+ bp->b_addr = folio_address(folio);
return 0;
}
-/* Unmap a shmem page that was mapped into the buffer cache. */
-void
-xmbuf_unmap_page(
- struct xfs_buf *bp)
-{
- struct page *page = bp->b_pages[0];
-
- ASSERT(xfs_buftarg_is_mem(bp->b_target));
-
- put_page(page);
-
- bp->b_addr = NULL;
- bp->b_pages[0] = NULL;
- bp->b_pages = NULL;
- bp->b_page_count = 0;
-}
-
/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
@@ -205,7 +182,7 @@ xmbuf_verify_daddr(
return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}
-/* Discard the page backing this buffer. */
+/* Discard the folio backing this buffer. */
static void
xmbuf_stale(
struct xfs_buf *bp)
@@ -220,7 +197,7 @@ xmbuf_stale(
}
/*
- * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * Finalize a buffer -- discard the backing folio if it's stale, or run the
* write verifier to detect problems.
*/
int
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
index eed4a7b63232..67d525cc1513 100644
--- a/fs/xfs/xfs_buf_mem.h
+++ b/fs/xfs/xfs_buf_mem.h
@@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
-int xmbuf_map_page(struct xfs_buf *bp);
-void xmbuf_unmap_page(struct xfs_buf *bp);
bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
int xmbuf_finalize(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
-# define xmbuf_map_page(...) (-ENOMEM)
-# define xmbuf_unmap_page(...) ((void)0)
# define xmbuf_verify_daddr(...) (false)
#endif /* CONFIG_XFS_MEMORY_BUFS */
+int xmbuf_map_backing_mem(struct xfs_buf *bp);
+
#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 3f2403a7b49c..c1a306268ae4 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -844,7 +844,8 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp &&
+
+ if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ea43c9a6e54c..da3161572735 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888..777438b853da 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(
trace_xfs_extent_free_deferred(mp, xefi);
- if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
- if (*rtgp != to_rtg(xefi->xefi_group)) {
- *rtgp = to_rtg(xefi->xefi_group);
- xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
- xfs_rtgroup_trans_join(tp, *rtgp,
- XFS_RTGLOCK_BITMAP);
- }
- error = xfs_rtfree_blocks(tp, *rtgp,
- xefi->xefi_startblock, xefi->xefi_blockcount);
+ if (xefi->xefi_flags & XFS_EFI_CANCELLED)
+ goto done;
+
+ if (*rtgp != to_rtg(xefi->xefi_group)) {
+ unsigned int lock_flags;
+
+ if (xfs_has_zoned(mp))
+ lock_flags = XFS_RTGLOCK_RMAP;
+ else
+ lock_flags = XFS_RTGLOCK_BITMAP;
+
+ *rtgp = to_rtg(xefi->xefi_group);
+ xfs_rtgroup_lock(*rtgp, lock_flags);
+ xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
}
+
+ if (xfs_has_zoned(mp)) {
+ error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ } else {
+ error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ }
+
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
-
+done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..84f08c976ac4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
- bool *drained_dio)
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- unsigned int *iolock)
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +485,7 @@ restart:
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
- &drained_dio);
+ &drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +495,48 @@ restart:
return kiocb_modified(iocb);
}
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
+ /*
+ * Check the rlimit and LFS boundary first so that we don't over-reserve
+ * by possibly a lot.
+ *
+ * The generic write path will redo this check later, and it might have
+ * changed by then. If it got expanded we'll stick to our earlier
+ * smaller limit, and if it is decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
+ */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining block will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(ip,
+ XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
+}
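A worked example (editorial, not part of the patch) of the reservation sizing above, assuming a hypothetical 4k-block file system; the helper name is made up.

#include <stdint.h>

#define SKETCH_BLOCKSIZE	4096u

/*
 * count rounded up to whole blocks, plus one block of slop for an
 * unaligned start offset and two more for zeroing the old EOF block and
 * the new start block: a 10000-byte write reserves 3 + 1 + 2 = 6 blocks,
 * and anything left over is handed back by xfs_zoned_space_unreserve().
 */
static uint64_t zoned_write_reservation_blocks(uint64_t count_bytes)
{
	return (count_bytes + SKETCH_BLOCKSIZE - 1) / SKETCH_BLOCKSIZE + 1 + 2;
}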
+
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -503,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_IOEND_DIRECT);
+ xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
+
/*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -597,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@@ -611,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, NULL, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
- if (iolock)
- xfs_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
@@ -675,7 +781,7 @@ retry_exclusive:
goto out_unlock;
}
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -721,9 +827,21 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+
+ /*
+ * For always COW inodes we also must check the alignment of each
+ * individual iovec segment, as they could end up with different
+ * I/Os due to the way bio_iov_iter_get_pages works, and we'd
+ * then overwrite an already written block.
+ */
+ if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
+ (xfs_is_always_cow_inode(ip) &&
+ (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
- return xfs_file_dio_write_aligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
static noinline ssize_t
@@ -740,7 +858,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -784,7 +902,7 @@ write_retry:
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -832,6 +950,67 @@ out:
}
STATIC ssize_t
+xfs_file_buffered_write_zoned(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
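An illustrative calculation (editorial, not part of the patch) of the iov_iter_truncate() bound used above, again assuming 4k blocks; the function is a stand-in for the XFS_FSB_TO_B/m_blockmask arithmetic.

#include <stdint.h>

/*
 * The buffered write is clamped to the bytes actually covered by the
 * reservation: reserved blocks converted to bytes, minus the unaligned
 * part of ki_pos within its block.  For example, 6 reserved blocks and
 * ki_pos == 0x1200 allow 6 * 4096 - 0x200 = 24064 writable bytes.
 */
static uint64_t zoned_buffered_write_limit(uint64_t reserved_blocks,
					   uint64_t ki_pos)
{
	return reserved_blocks * 4096u - (ki_pos & (4096u - 1));
}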
+
+STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
@@ -878,6 +1057,8 @@ xfs_file_write_iter(
return ret;
}
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@@ -932,7 +1113,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -948,7 +1130,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
- error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1004,7 +1186,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1017,7 +1200,7 @@ xfs_falloc_zero_range(
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len);
+ error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range(
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
STATIC long
-xfs_file_fallocate(
+__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
- if (mode & ~XFS_FALLOC_FL_SUPPORTED)
- return -EOPNOTSUPP;
-
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1124,16 +1303,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
- error = xfs_falloc_collapse_range(file, offset, len);
+ error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
- error = xfs_falloc_zero_range(file, mode, offset, len);
+ error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1154,6 +1333,54 @@ out_unlock:
return error;
}
+static long
+xfs_file_zoned_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int error;
+
+ error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ error = __xfs_file_fallocate(file, mode, offset, len, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
+ return error;
+}
+
+static long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+ * and new zeroes out of place. Get a reservation for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(XFS_I(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)))
+ return xfs_file_zoned_fallocate(file, mode, offset, len);
+ return __xfs_file_fallocate(file, mode, offset, len, NULL);
+}
+
STATIC int
xfs_file_fadvise(
struct file *file,
@@ -1347,15 +1574,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
- if (inode->i_nlink &&
- (file->f_mode & FMODE_WRITE) &&
- !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
- !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@@ -1469,9 +1703,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
-xfs_write_fault(
+__xfs_write_fault(
struct vm_fault *vmf,
- unsigned int order)
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1498,13 +1733,50 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
+ int error;
+ vm_fault_t ret;
+
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ *
+ * But as the overallocation is limited to less than a folio and will be
+ * released instantly, that's just fine.
+ */
+ error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
+ &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
+ return ret;
+}
+
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
+
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1613,7 +1885,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 1dbd2d75f7ae..a4bc1642fe56 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
+ xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
+ if (keys[0].fmr_physical < rtstart_daddr) {
+ struct xfs_fsmap_irec frec = {
+ .owner = XFS_RMAP_OWN_FS,
+ .len_daddr = rtstart_daddr,
+ };
+
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0) {
+ info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
+ return 0;
+ }
+
+ /* Fabricate an rmap entry for space occupied by the data dev */
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+ }
+
+ start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
+ min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
@@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */
+static uint32_t
+xfs_getfsmap_device(
+ struct xfs_mount *mp,
+ enum xfs_device dev)
+{
+ if (mp->m_sb.sb_rtstart)
+ return dev;
+
+ switch (dev) {
+ case XFS_DEV_DATA:
+ return new_encode_dev(mp->m_ddev_targp->bt_dev);
+ case XFS_DEV_LOG:
+ return new_encode_dev(mp->m_logdev_targp->bt_dev);
+ case XFS_DEV_RT:
+ if (!mp->m_rtdev_targp)
+ break;
+ return new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ }
+
+ return -1;
+}
+
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
- if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
- fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
- return true;
- if (mp->m_logdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
- return true;
- if (mp->m_rtdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
- return true;
- return false;
+ return fm->fmr_device == 0 ||
+ fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
+ (mp->m_rtdev_targp &&
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}
/* Ensure that the low key is less than the high key. */
@@ -1126,7 +1166,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
- handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@@ -1134,13 +1174,17 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
- handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
- if (mp->m_rtdev_targp) {
+ /*
+ * For zoned file systems there is no rtbitmap, so only support fsmap
+ * if the caller is privileged enough to use the full rmap version.
+ */
+ if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@@ -1230,7 +1274,13 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
- head->fmh_oflags = FMH_OF_DEV_T;
+
+ /*
+ * For an internal RT device we need to report different synthetic devices
+ * for a single physical device, and thus can't report the actual dev_t.
+ */
+ if (!mp->m_sb.sb_rtstart)
+ head->fmh_oflags = FMH_OF_DEV_T;
return error;
}
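A small userspace-side sketch (editorial, not part of the patch) of how an fsmap consumer should interpret fmr_device after this change: it is only a real dev_t when the kernel set FMH_OF_DEV_T, otherwise it is the synthetic XFS_DEV_* identifier. The constant is redefined locally so the sketch stands alone; the authoritative value lives in the uapi fsmap header.

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_FMH_OF_DEV_T	0x1	/* mirrors FMH_OF_DEV_T, assumed value */

/* True when fmr_device fields in this reply can be decoded as dev_t. */
static bool fsmap_devices_are_dev_t(uint32_t fmh_oflags)
{
	return (fmh_oflags & SKETCH_FMH_OF_DEV_T) != 0;
}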
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 455298503d01..0ada73569394 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,6 +24,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_metafile.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -110,7 +111,7 @@ xfs_growfs_data_private(
if (nb > mp->m_sb.sb_dblocks) {
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error)
return error;
xfs_buf_relse(bp);
@@ -300,24 +301,30 @@ xfs_growfs_data(
struct xfs_mount *mp,
struct xfs_growfs_data *in)
{
- int error = 0;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
+ /* we can't grow the data section when an internal RT section exists */
+ if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
if (error)
- goto out_error;
+ goto out_unlock;
}
if (in->newblocks != mp->m_sb.sb_dblocks) {
error = xfs_growfs_data_private(mp, in);
if (error)
- goto out_error;
+ goto out_unlock;
}
/* Post growfs calculations needed to reflect new state in operations */
@@ -331,13 +338,12 @@ xfs_growfs_data(
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
-out_error:
/*
- * Increment the generation unconditionally, the error could be from
- * updating the secondary superblocks, in which case the new size
- * is live already.
+ * Increment the generation unconditionally, after trying to update the
+ * secondary superblocks, as the new size is live already at this point.
*/
mp->m_generation++;
+out_unlock:
mutex_unlock(&mp->m_growlock);
return error;
}
@@ -366,6 +372,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
+ enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@@ -373,6 +380,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;
+ ASSERT(ctr < XC_FREE_NR);
+
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -391,16 +400,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
- if (mp->m_resblks > request) {
- lcounter = mp->m_resblks_avail - request;
+ if (mp->m_free[ctr].res_total > request) {
+ lcounter = mp->m_free[ctr].res_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
- mp->m_resblks_avail -= lcounter;
+ mp->m_free[ctr].res_avail -= lcounter;
}
- mp->m_resblks = request;
+ mp->m_free[ctr].res_total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -409,7 +418,7 @@ xfs_reserve_blocks(
/*
* If the request is larger than the current reservation, reserve the
- * blocks before we update the reserve counters. Sample m_fdblocks and
+ * blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@@ -419,10 +428,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
- free = percpu_counter_sum(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- delta = request - mp->m_resblks;
- mp->m_resblks = request;
+ free = xfs_sum_freecounter_raw(mp, ctr) -
+ xfs_freecounter_unavailable(mp, ctr);
+ delta = request - mp->m_free[ctr].res_total;
+ mp->m_free[ctr].res_total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@@ -436,9 +445,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
- error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
+ error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
@@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks(
return error;
}
- if (xfs_has_realtime(mp)) {
- err2 = xfs_rt_resv_init(mp);
- if (err2 && err2 != -ENOSPC) {
- xfs_warn(mp,
- "Error %d reserving realtime metadata reserve pool.", err2);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- }
+ err2 = xfs_metafile_resv_init(mp);
+ if (err2 && err2 != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving realtime metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- if (err2 && !error)
+ if (!error)
error = err2;
}
@@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;
- if (xfs_has_realtime(mp))
- xfs_rt_resv_free(mp);
-
+ xfs_metafile_resv_free(mp);
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 3e2f73bcf831..9d23c361ef56 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,8 @@
int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
+ uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7b6c026d01a1..2f53ca7e12d4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_REALTIME_INODE(ip))
+ if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
- if (__percpu_counter_compare(&mp->m_frextents,
+ if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;
- if (__percpu_counter_compare(&mp->m_fdblocks,
+ if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b1f9f156ec88..ce6b8ffbaa2c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1721,8 +1721,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * igeo->blocks_per_cluster,
- XBF_UNMAPPED, &bp);
+ mp->m_bsize * igeo->blocks_per_cluster, 0, &bp);
if (error)
return error;
@@ -3074,5 +3073,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
- return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+ return xfs_is_zoned_inode(ip) ||
+ (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c08093a65352..4bb7a99e0dc4 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -25,19 +25,9 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
- union {
- struct {
- struct xfs_dquot *i_udquot; /* user dquot */
- struct xfs_dquot *i_gdquot; /* group dquot */
- struct xfs_dquot *i_pdquot; /* project dquot */
- };
-
- /*
- * Space that has been set aside to accomodate expansions of a
- * metadata btree rooted in this file.
- */
- uint64_t i_meta_resv_asked;
- };
+ struct xfs_dquot *i_udquot; /* user dquot */
+ struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@@ -69,8 +59,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
- /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
+ /*
+ * i_used_blocks is used for zoned rtrmap inodes,
+ * i_cowextsize is used for other v3 inodes,
+ * i_flushiter for v1/2 inodes
+ */
union {
+ uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
+static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
+{
+ return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
+}
+
bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 35803fcf0beb..40fc1bf900af 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index f3bfb814378c..7205fd14f6b3 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ed85322507dd..d250f7f74e3b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_reserve_blocks(mp, fsop.resblks);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}
spin_lock(&mp->m_sb_lock);
- fsop.resblks = mp->m_resblks;
- fsop.resblks_avail = mp->m_resblks_avail;
+ fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total;
+ fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail;
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(arg, &fsop, sizeof(fsop)))
@@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
- .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp),
- .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
+ xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
+ .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};
if (copy_to_user(uarg, &out, sizeof(out)))
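These are the same numbers user space retrieves through the XFS_IOC_FSCOUNTS ioctl. A minimal userspace sketch, assuming the xfsprogs development headers provide <xfs/xfs_fs.h>:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs_fs.h>	/* assumed xfsprogs header location */

	int main(int argc, char **argv)
	{
		struct xfs_fsop_counts counts;
		int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

		if (fd < 0 || ioctl(fd, XFS_IOC_FSCOUNTS, &counts) < 0) {
			perror("XFS_IOC_FSCOUNTS");
			return 1;
		}
		printf("free data blocks: %llu, free rt extents: %llu\n",
				(unsigned long long)counts.freedata,
				(unsigned long long)counts.freertx);
		return 0;
	}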
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d61460309a78..cb23c8871f81 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -30,6 +30,8 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -431,13 +433,14 @@ xfs_quota_calc_throttle(
static int64_t
xfs_iomap_freesp(
- struct percpu_counter *counter,
+ struct xfs_mount *mp,
+ unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;
- freesp = percpu_counter_read_positive(counter);
+ freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@@ -536,10 +539,10 @@ xfs_iomap_prealloc_size(
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
- xfs_iomap_freesp(&mp->m_frextents,
+ xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
- freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
+ freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);
/*
@@ -828,6 +831,10 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ /* HW-offload atomics are always used in this path */
+ if (flags & IOMAP_ATOMIC)
+ iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
@@ -962,6 +969,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple. The space has already been reserved before taking the
+ * IOLOCK, the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+ /*
+ * Needs to be pushed down into the allocator so that only writes into
+ * a single zone can be supported.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ /*
+	 * Ensure the extent list is in memory so that we don't have to read
+	 * it from the I/O completion handler.
+ */
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ }
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DIRTY;
+ iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+ iomap->offset = offset;
+ iomap->length = length;
+ iomap->flags = IOMAP_F_ANON_WRITE;
+ return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+ .iomap_begin = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
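For context, a minimal caller sketch (the helper name is hypothetical, not part of this diff) showing how a direct write path could pick the new ops for zoned RT inodes:

	/* Hypothetical helper: zoned RT inodes use the zoned direct write ops. */
	static const struct iomap_ops *
	xfs_example_dio_write_ops(struct xfs_inode *ip)
	{
		if (IS_ENABLED(CONFIG_XFS_RT) && xfs_is_zoned_inode(ip))
			return &xfs_zoned_direct_write_iomap_ops;
		return &xfs_direct_write_iomap_ops;
	}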
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
@@ -987,6 +1047,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
+/*
+ * Convert a hole to a delayed allocation.
+ */
+static void
+xfs_bmap_add_extent_hole_delay(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *new) /* new data to add to file extents */
+{
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ temp = left.br_blockcount + new->br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, icur, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
+ }
+}
+
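A worked example of the merge accounting above: if the left delalloc extent had reserved 4 indirect blocks and the new one 3, oldlen is 7; if the worst-case indirect length for the merged extent works out to, say, 5, newlen becomes 5 and the surplus 2 blocks flow back to the free space counter via xfs_add_fdblocks(), with xfs_mod_delalloc() recording the matching decrease.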
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
+static int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *icur,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ uint64_t fdblocks;
+ int error;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+
+retry:
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ aoff = off;
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
+
+ /*
+	 * If we're targeting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
+ struct xfs_bmbt_irec prev;
+ xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
+
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_quota_reserve_blkres(ip, alen);
+ if (error)
+ goto out;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ fdblocks = indlen;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ ASSERT(!xfs_is_zoned_inode(ip));
+ error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+ if (error)
+ goto out_unreserve_quota;
+ } else {
+ fdblocks += alen;
+ }
+
+ error = xfs_dec_fdblocks(mp, fdblocks, false);
+ if (error)
+ goto out_unreserve_frextents;
+
+ ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip, alen, indlen);
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+ return 0;
+
+out_unreserve_frextents:
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
+ return error;
+}
+
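An illustrative caller (hypothetical helper name) for the got/eof contract described above: look up the first extent at or beyond the offset to derive got and eof, then reserve the delalloc extent in the data fork.

	static int
	xfs_example_reserve_delalloc(
		struct xfs_inode	*ip,
		xfs_fileoff_t		off,
		xfs_filblks_t		len)
	{
		struct xfs_bmbt_irec	got;
		struct xfs_iext_cursor	icur;
		bool			eof;

		/* the caller is assumed to hold the ILOCK, as the real callers do */
		eof = !xfs_iext_lookup_extent(ip, &ip->i_df, off, &icur, &got);
		return xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, off, len,
				0, &got, &icur, eof);
	}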
+static int
+xfs_zoned_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ u16 iomap_flags = IOMAP_F_SHARED;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ xfs_filblks_t count_fsb;
+ xfs_extlen_t indlen;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ASSERT(!xfs_get_extsz_hint(ip));
+ ASSERT(!(flags & IOMAP_UNSHARE));
+ ASSERT(ac);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For zeroing operations check if there is any data to zero first.
+ *
+ * For regular writes we always need to allocate new blocks, but need to
+ * provide the source mapping when the range is unaligned to support
+ * read-modify-write of the whole block in the page cache.
+ *
+ * In either case we need to limit the reported range to the boundaries
+ * of the source map in the data fork.
+ */
+ if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+ !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+ (flags & IOMAP_ZERO)) {
+ struct xfs_bmbt_irec smap;
+ struct xfs_iext_cursor scur;
+
+ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+ &smap))
+ smap.br_startoff = end_fsb; /* fake hole until EOF */
+ if (smap.br_startoff > offset_fsb) {
+ /*
+ * We never need to allocate blocks for zeroing a hole.
+ */
+ if (flags & IOMAP_ZERO) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb,
+ smap.br_startoff);
+ goto out_unlock;
+ }
+ end_fsb = min(end_fsb, smap.br_startoff);
+ } else {
+ end_fsb = min(end_fsb,
+ smap.br_startoff + smap.br_blockcount);
+ xfs_trim_extent(&smap, offset_fsb,
+ end_fsb - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+ xfs_iomap_inode_sequence(ip, 0));
+ if (error)
+ goto out_unlock;
+ }
+ }
+
+ if (!ip->i_cowfp)
+ xfs_ifork_init_cow(ip);
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = end_fsb;
+ if (got.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &got);
+ goto done;
+ }
+
+ /*
+ * Cap the maximum length to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does.
+ */
+ end_fsb = min(end_fsb, got.br_startoff);
+ count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+ XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+ /*
+ * The block reservation is supposed to cover all blocks that the
+	 * operation could possibly write, but there is a nasty corner case
+ * where blocks could be stolen from underneath us:
+ *
+ * 1) while this thread iterates over a larger buffered write,
+ * 2) another thread is causing a write fault that calls into
+ * ->page_mkwrite in range this thread writes to, using up the
+ * delalloc reservation created by a previous call to this function.
+ * 3) another thread does direct I/O on the range that the write fault
+ * happened on, which causes writeback of the dirty data.
+	 * 4) this then sets the stale flag, which cuts the current iomap
+ * iteration short, causing the new call to ->iomap_begin that gets
+ * us here again, but now without a sufficient reservation.
+ *
+ * This is a very unusual I/O pattern, and nothing but generic/095 is
+ * known to hit it. There's not really much we can do here, so turn this
+ * into a short write.
+ */
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+ ip->i_ino, current->comm);
+ count_fsb = ac->reserved_blocks;
+ if (!count_fsb) {
+ error = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ error = xfs_quota_reserve_blkres(ip, count_fsb);
+ if (error)
+ goto out_unlock;
+
+ indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+ error = xfs_dec_fdblocks(mp, indlen, false);
+ if (error)
+ goto out_unlock;
+ ip->i_delayed_blks += count_fsb;
+ xfs_mod_delalloc(ip, count_fsb, indlen);
+
+ got.br_startoff = offset_fsb;
+ got.br_startblock = nullstartblock(indlen);
+ got.br_blockcount = count_fsb;
+ got.br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+ ac->reserved_blocks -= count_fsb;
+ iomap_flags |= IOMAP_F_NEW;
+
+ trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+ XFS_COW_FORK, &got);
+done:
+ error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+ xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1013,6 +1522,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_is_zoned_inode(ip))
+ return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+ count, flags, iomap, srcmap);
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1245,10 +1758,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
- offset, offset + length);
+ offset, offset + length, iter->private);
}
static int
@@ -1485,6 +2001,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1495,13 +2012,14 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1510,5 +2028,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, ac);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8347268af727..d330c4a581b1 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
- bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -49,6 +51,7 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
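A sketch of the new calling convention (a hypothetical wrapper, mirroring the xfs_setattr_size hunk later in this diff): zoned inodes bracket zeroing with a zone space reservation, other inodes simply pass an empty context.

	static int
	xfs_example_zero_eof(
		struct xfs_inode	*ip,
		loff_t			oldsize,
		loff_t			newsize)
	{
		struct xfs_zone_alloc_ctx ac = { };
		bool			did_zero = false;
		int			error;

		if (xfs_is_zoned_inode(ip)) {
			/* flags taken from the setattr path in this diff */
			error = xfs_zoned_space_reserve(ip, 1,
					XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
			if (error)
				return error;
		}

		error = xfs_zero_range(ip, oldsize, newsize - oldsize, &ac,
				&did_zero);

		if (xfs_is_zoned_inode(ip))
			xfs_zoned_space_unreserve(ip, &ac);
		return error;
	}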
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 40289fe6f5b2..756bd3ca8e00 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -298,14 +299,14 @@ xfs_vn_create(
return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
-STATIC int
+STATIC struct dentry *
xfs_vn_mkdir(
struct mnt_idmap *idmap,
struct inode *dir,
struct dentry *dentry,
umode_t mode)
{
- return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
+ return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL));
}
STATIC struct dentry *
@@ -854,6 +855,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
+ struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -890,6 +892,28 @@ xfs_setattr_size(
inode_dio_wait(inode);
/*
+ * Normally xfs_zoned_space_reserve is supposed to be called outside the
+ * IOLOCK. For truncate we can't do that since ->setattr is called with
+ * it already held by the VFS. So for now chicken out and try to
+ * allocate space under it.
+ *
+ * To avoid deadlocks this means we can't block waiting for space, which
+ * can lead to spurious -ENOSPC if there are no directly available
+ * blocks. We mitigate this a bit by allowing zeroing to dip into the
+ * reserved pool, but eventually the VFS calling convention needs to
+ * change.
+ */
+ if (xfs_is_zoned_inode(ip)) {
+ error = xfs_zoned_space_reserve(ip, 1,
+ XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+ if (error) {
+ if (error == -EAGAIN)
+ return -ENOSPC;
+ return error;
+ }
+ }
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
@@ -902,11 +926,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
- &did_zeroing);
+ &ac, &did_zeroing);
} else {
- error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
+
if (error)
return error;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f8851ff835de..6493bdb57351 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_log_ticket_cache;
@@ -3540,6 +3541,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
+ xfs_zoned_wake_all(log->l_mp);
+
return log_error;
}
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 6ed485ff2756..15d410d16bb2 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -173,6 +173,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
+ [XFS_EXPERIMENTAL_ZONED] = {
+ .opstate = XFS_OPSTATE_WARNED_ZONED,
+ .name = "zoned RT device",
+ },
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb36ced9df7..a92a4d09c8e9 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -99,6 +99,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
+ XFS_EXPERIMENTAL_ZONED,
XFS_EXPERIMENTAL_MAX,
};
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b69356582b86..00b53f479ece 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
+#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -185,7 +186,7 @@ xfs_readsb(
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0, &bp, buf_ops);
+ BTOBB(sector_size), &bp, buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
@@ -413,7 +414,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "last sector read failed");
return error;
@@ -430,7 +431,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSB_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "log device read failed");
return error;
@@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}
+static const char *const xfs_free_pool_name[] = {
+ [XC_FREE_BLOCKS] = "free blocks",
+ [XC_FREE_RTEXTENTS] = "free rt extents",
+ [XC_FREE_RTAVAILABLE] = "available rt extents",
+};
+
uint64_t
-xfs_default_resblks(xfs_mount_t *mp)
+xfs_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
{
- uint64_t resblks;
-
- /*
- * We default to 5% or 8192 fsbs of space reserved, whichever is
- * smaller. This is intended to cover concurrent allocation
- * transactions when we initially hit enospc. These each require a 4
- * block reservation. Hence by default we cover roughly 2000 concurrent
- * allocation reservations.
- */
- resblks = mp->m_sb.sb_dblocks;
- do_div(resblks, 20);
- resblks = min_t(uint64_t, resblks, 8192);
- return resblks;
+ switch (ctr) {
+ case XC_FREE_BLOCKS:
+ /*
+ * Default to 5% or 8192 FSBs of space reserved, whichever is
+ * smaller.
+ *
+ * This is intended to cover concurrent allocation transactions
+ * when we initially hit ENOSPC. These each require a 4 block
+ * reservation. Hence by default we cover roughly 2000
+ * concurrent allocation reservations.
+ */
+ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+ case XC_FREE_RTEXTENTS:
+ case XC_FREE_RTAVAILABLE:
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ return xfs_zoned_default_resblks(mp, ctr);
+ return 0;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
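For scale: with 4 KiB blocks a 1 TiB data device has sb_dblocks = 268,435,456, so 5% would be roughly 13.4 million blocks and the XC_FREE_BLOCKS default is capped at the smaller value, 8192 blocks (32 MiB).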
/* Ensure the summary counts are correct. */
@@ -543,7 +560,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
- if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
@@ -678,6 +695,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
+ int i;
xfs_sb_mount_common(mp, sbp);
@@ -747,27 +765,15 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
- super_set_sysfs_name_id(mp->m_super);
-
- error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
- NULL, mp->m_super->s_id);
- if (error)
- goto out;
-
- error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
- &mp->m_kobj, "stats");
+ error = xfs_mount_sysfs_init(mp);
if (error)
- goto out_remove_sysfs;
+ goto out_remove_scrub_stats;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
- error = xfs_error_sysfs_init(mp);
- if (error)
- goto out_remove_scrub_stats;
-
error = xfs_errortag_init(mp);
if (error)
- goto out_remove_error_sysfs;
+ goto out_remove_sysfs;
error = xfs_uuid_mount(mp);
if (error)
@@ -1031,6 +1037,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
+ if (xfs_has_zoned(mp)) {
+ error = xfs_mount_zones(mp);
+ if (error)
+ goto out_rtunmount;
+ }
+
/*
* Complete the quota initialisation, post-log-replay component.
*/
@@ -1046,22 +1058,28 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
- * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
- * are not allowed to use this reserved space.
+ * attr, unwritten extent conversion at ENOSPC, garbage collection
+ * etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
- if (error)
- xfs_warn(mp,
- "Unable to allocate reserve blocks. Continuing without reserve pool.");
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = xfs_reserve_blocks(mp, i,
+ xfs_default_resblks(mp, i));
+ if (error)
+ xfs_warn(mp,
+"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
+ xfs_free_pool_name[i]);
+ }
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
+
+ xfs_zone_gc_start(mp);
}
return 0;
@@ -1069,6 +1087,8 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@@ -1116,13 +1136,10 @@ xfs_mountfs(
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
- out_remove_error_sysfs:
- xfs_error_sysfs_del(mp);
+ out_remove_sysfs:
+ xfs_mount_sysfs_del(mp);
out_remove_scrub_stats:
xchk_stats_unregister(mp->m_scrub_stats);
- xfs_sysfs_del(&mp->m_stats.xs_kobj);
- out_remove_sysfs:
- xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@@ -1148,8 +1165,12 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
+ if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+ xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@@ -1173,7 +1194,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
- error = xfs_reserve_blocks(mp, 0);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@@ -1195,10 +1216,8 @@ xfs_unmountfs(
xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
- xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
- xfs_sysfs_del(&mp->m_stats.xs_kobj);
- xfs_sysfs_del(&mp->m_kobj);
+ xfs_mount_sysfs_del(mp);
}
/*
@@ -1220,52 +1239,67 @@ xfs_fs_writable(
return true;
}
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+uint64_t
+xfs_freecounter_unavailable(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ if (ctr != XC_FREE_BLOCKS)
+ return 0;
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
void
xfs_add_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta)
{
- bool has_resv_pool = (counter == &mp->m_fdblocks);
+ struct xfs_freecounter *counter = &mp->m_free[ctr];
uint64_t res_used;
/*
* If the reserve pool is depleted, put blocks back into it first.
* Most of the time the pool is full.
*/
- if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
- percpu_counter_add(counter, delta);
+ if (likely(counter->res_avail == counter->res_total)) {
+ percpu_counter_add(&counter->count, delta);
return;
}
spin_lock(&mp->m_sb_lock);
- res_used = mp->m_resblks - mp->m_resblks_avail;
+ res_used = counter->res_total - counter->res_avail;
if (res_used > delta) {
- mp->m_resblks_avail += delta;
+ counter->res_avail += delta;
} else {
delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(counter, delta);
+ counter->res_avail = counter->res_total;
+ percpu_counter_add(&counter->count, delta);
}
spin_unlock(&mp->m_sb_lock);
}
+
+/* Adjust in-core free blocks or RT extents. */
int
xfs_dec_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta,
bool rsvd)
{
- int64_t lcounter;
- uint64_t set_aside = 0;
+ struct xfs_freecounter *counter = &mp->m_free[ctr];
s32 batch;
- bool has_resv_pool;
- ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
- has_resv_pool = (counter == &mp->m_fdblocks);
- if (rsvd)
- ASSERT(has_resv_pool);
+ ASSERT(ctr < XC_FREE_NR);
/*
* Taking blocks away, need to be more accurate the closer we
@@ -1275,7 +1309,7 @@ xfs_dec_freecounter(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1292,34 +1326,34 @@ xfs_dec_freecounter(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- if (has_resv_pool)
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(counter, -((int64_t)delta), batch);
- if (__percpu_counter_compare(counter, set_aside,
- XFS_FDBLOCKS_BATCH) >= 0) {
- /* we had space! */
- return 0;
- }
-
- /*
- * lock up the sb for dipping into reserves before releasing the space
- * that took us to ENOSPC.
- */
- spin_lock(&mp->m_sb_lock);
- percpu_counter_add(counter, delta);
- if (!has_resv_pool || !rsvd)
- goto fdblocks_enospc;
-
- lcounter = (long long)mp->m_resblks_avail - delta;
- if (lcounter >= 0) {
- mp->m_resblks_avail = lcounter;
+ percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch);
+ if (__percpu_counter_compare(&counter->count,
+ xfs_freecounter_unavailable(mp, ctr),
+ XFS_FDBLOCKS_BATCH) < 0) {
+ /*
+ * Lock up the sb for dipping into reserves before releasing the
+ * space that took us to ENOSPC.
+ */
+ spin_lock(&mp->m_sb_lock);
+ percpu_counter_add(&counter->count, delta);
+ if (!rsvd)
+ goto fdblocks_enospc;
+ if (delta > counter->res_avail) {
+ if (ctr == XC_FREE_BLOCKS)
+ xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+ goto fdblocks_enospc;
+ }
+ counter->res_avail -= delta;
+ trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
- return 0;
}
- xfs_warn_once(mp,
-"Reserve blocks depleted! Consider increasing reserve pool size.");
+
+ /* we had space! */
+ return 0;
fdblocks_enospc:
+ trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fbed172d6770..799b84220ebb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -98,11 +98,41 @@ struct xfs_groups {
uint8_t blklog;
/*
+	 * Zoned devices can have gaps between the usable capacity of a zone and
+	 * its end in the LBA/daddr address space. In other words, the hardware
+ * equivalent to the RT groups already takes care of the power of 2
+ * alignment for us. In this case the sparse FSB/RTB address space maps
+ * 1:1 to the device address space.
+ */
+ bool has_daddr_gaps;
+
+ /*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;
+
+ /*
+ * Start of the first group in the device. This is used to support a
+ * RT device following the data device on the same block device for
+ * SMR hard drives.
+ */
+ xfs_fsblock_t start_fsb;
+};
+
+struct xfs_freecounter {
+ /* free blocks for general use: */
+ struct percpu_counter count;
+
+ /* total reserved blocks: */
+ uint64_t res_total;
+
+ /* available reserved blocks: */
+ uint64_t res_avail;
+
+ /* reserved blks @ remount,ro: */
+ uint64_t res_saved;
};
/*
@@ -198,6 +228,7 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
+ unsigned int m_max_open_zones;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -222,8 +253,8 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
- struct percpu_counter m_fdblocks; /* free block counter */
- struct percpu_counter m_frextents; /* free rt extent counter */
+
+ struct xfs_freecounter m_free[XC_FREE_NR];
/*
* Count of data device blocks reserved for delayed allocations,
@@ -245,10 +276,8 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;
struct xfs_groups m_groups[XG_TYPE_MAX];
- uint64_t m_resblks; /* total reserved blocks */
- uint64_t m_resblks_avail;/* available reserved blocks */
- uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
@@ -258,10 +287,16 @@ typedef struct xfs_mount {
#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
struct xchk_stats *m_scrub_stats;
#endif
+ struct xfs_kobj m_zoned_kobj;
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
atomic_t m_rtgrotor; /* last rtgroup rtpicked */
+ struct mutex m_metafile_resv_lock;
+ uint64_t m_metafile_resv_target;
+ uint64_t m_metafile_resv_used;
+ uint64_t m_metafile_resv_avail;
+
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
/*
@@ -336,8 +371,10 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
+#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
+#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
+__XFS_HAS_FEAT(zoned, ZONED)
+__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
- return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
+ return xfs_has_rtgroups(mp) &&
+ xfs_has_realtime(mp) &&
+ !xfs_has_zoned(mp);
}
static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+ return !xfs_has_zoned(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
@@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
+/* Kernel has logged a warning about zoned RT device being used on this fs. */
+#define XFS_OPSTATE_WARNED_ZONED 19
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_ZONEGC_RUNNING 20
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
extern void xfs_uuid_table_free(void);
-extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+uint64_t xfs_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
@@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
+uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+
/*
- * Estimate the amount of free space that is not available to userspace and is
- * not explicitly reserved from the incore fdblocks. This includes:
- *
- * - The minimum number of blocks needed to support splitting a bmap btree
- * - The blocks currently in use by the freespace btrees because they record
- * the actual blocks that will fill per-AG metadata space reservations
+ * Sum up the freecount, but never return negative values.
*/
-static inline uint64_t
-xfs_fdblocks_unavailable(
- struct xfs_mount *mp)
+static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
{
- return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+ return percpu_counter_sum_positive(&mp->m_free[ctr].count);
}
-int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+/*
+ * Same as above, but does return negative values. Mostly useful for
+ * special cases like repair and tracing.
+ */
+static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_sum(&mp->m_free[ctr].count);
+}
+
+/*
+ * This just provides an estimate without the cpu-local updates; use
+ * xfs_sum_freecounter for the exact value.
+ */
+static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_read_positive(&mp->m_free[ctr].count);
+}
+
+static inline int xfs_compare_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, s64 rhs, s32 batch)
+{
+ return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch);
+}
+
+static inline void xfs_set_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, uint64_t val)
+{
+ percpu_counter_set(&mp->m_free[ctr].count, val);
+}
+
+int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
-void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);
static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
- return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+ return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}
static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
+ xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}
static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
+ return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}
static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_frextents, delta);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
int64_t ind_delta);
+static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
+{
+ percpu_counter_add(&mp->m_delalloc_blks, delta);
+}
#endif /* __XFS_MOUNT_H__ */
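To illustrate the enum-indexed replacement for the old per-counter pointers, a hypothetical pair of helpers (not part of the patch), using only calls declared above:

	/* Hypothetical example: reserve blocks, optionally dipping into the reserve pool. */
	static int
	xfs_example_take_blocks(struct xfs_mount *mp, uint64_t nblocks, bool rsvd)
	{
		/* fails with -ENOSPC once free space (and, with rsvd, the reserve) is gone */
		return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, nblocks, rsvd);
	}

	/* Hypothetical example: cheap low-space test against the 5% threshold. */
	static bool
	xfs_example_low_on_space(struct xfs_mount *mp)
	{
		return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
				mp->m_low_space[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0;
	}

Blocks taken this way are handed back with xfs_add_freecounter(mp, XC_FREE_BLOCKS, nblocks), which refills the reserve pool before adding to the percpu counter.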
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e1ba5af6250f..417439b58785 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
- if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
+ if (mp->m_sb.sb_rextents &&
+ (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 59f7fc16eb80..cc3b4df88110 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
- &del);
+ &del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_rtgroup *rtg;
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, fsb);
- rtg = xfs_rtgroup_get(mp, rgno);
- if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
- error = -ENOSPC;
- xfs_rtgroup_put(rtg);
- return error;
+ if (xfs_metafile_resv_critical(mp))
+ return -ENOSPC;
+ return 0;
}
agno = XFS_FSB_TO_AGNO(mp, fsb);
@@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return xfs_zero_range(ip, isize, pos - isize, NULL);
+ return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 57bef567e011..6484c596ecea 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
- kvfree(rtg->rtg_rsum_cache);
+ if (!xfs_has_zoned(rtg_mount(rtg)))
+ kvfree(rtg->rtg_rsum_cache);
}
static int
@@ -837,7 +839,7 @@ xfs_growfs_rt_init_rtsb(
return 0;
error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
- 0, &rtsb_bp);
+ &rtsb_bp);
if (error)
return error;
@@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}
+static void
+xfs_growfs_rt_sb_fields(
+ struct xfs_trans *tp,
+ const struct xfs_mount *nmp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+ nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
+ if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
+ if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+ nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
+ if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+ nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
+ if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
+ if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
+ nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+}
+
+static int
+xfs_growfs_rt_zoned(
+ struct xfs_rtgroup *rtg,
+ xfs_rfsblock_t nrblocks)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_mount *nmp;
+ struct xfs_trans *tp;
+ xfs_rtbxlen_t freed_rtx;
+ int error;
+
+ /*
+ * Calculate new sb and mount fields for this round. Also ensure the
+ * rtg_extents value is uptodate as the rtbitmap code relies on it.
+ */
+ nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
+ mp->m_sb.sb_rextsize);
+ if (!nmp)
+ return -ENOMEM;
+ freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
+
+ xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
+ nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
+
+ error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_growfs_rt_sb_fields(tp, nmp);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+
+ /*
+ * Ensure the mount RT feature flag is now set, and compute new
+ * maxlevels for rt btrees.
+ */
+ mp->m_features |= XFS_FEAT_REALTIME;
+ xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
+ xfs_zoned_add_available(mp, freed_rtx);
+out_free:
+ kfree(nmp);
+ return error;
+}
+
static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
- if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
- nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
- if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
- nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
- if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
- nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
- if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
- nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
- if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
- nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
- if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
- nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+ xfs_growfs_rt_sb_fields(args.tp, nmp);
/*
* Free the new extent.
@@ -1127,6 +1190,11 @@ xfs_growfs_rtg(
goto out_rele;
}
+ if (xfs_has_zoned(mp)) {
+ error = xfs_growfs_rt_zoned(rtg, nrblocks);
+ goto out_rele;
+ }
+
error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@@ -1144,10 +1212,8 @@ xfs_growfs_rtg(
goto out_error;
}
- if (old_rsum_cache)
- kvfree(old_rsum_cache);
- xfs_rtgroup_rele(rtg);
- return 0;
+ kvfree(old_rsum_cache);
+ goto out_rele;
out_error:
/*
@@ -1195,6 +1261,22 @@ xfs_growfs_check_rtgeom(
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
+
+ if (xfs_has_zoned(mp)) {
+ uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ uint32_t rem;
+
+ if (rextsize != 1)
+ return -EINVAL;
+ div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
+ if (rem) {
+ xfs_warn(mp,
+"new RT volume size (%lld) not aligned to RT group size (%d)",
+ mp->m_sb.sb_rblocks, gblocks);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1249,6 +1331,35 @@ xfs_grow_last_rtg(
}
/*
+ * Read in the last block of the RT device to make sure it is accessible.
+ */
+static int
+xfs_rt_check_size(
+ struct xfs_mount *mp,
+ xfs_rfsblock_t last_block)
+{
+ xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
+ struct xfs_buf *bp;
+ int error;
+
+ if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
+ xfs_warn(mp, "RT device size overflow: %llu != %llu",
+ XFS_BB_TO_FSB(mp, daddr), last_block);
+ return -EFBIG;
+ }
+
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
+ XFS_FSB_TO_BB(mp, 1), &bp, NULL);
+ if (error)
+ xfs_warn(mp, "cannot read last RT device sector (%lld)",
+ last_block);
+ else
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/*
* Grow the realtime area of the filesystem.
*/
int
@@ -1259,7 +1370,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
- struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;
@@ -1302,15 +1412,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
- /*
- * Read in the last block of the device, make sure it exists.
- */
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- XFS_FSB_TO_BB(mp, in->newblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+
+ error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
- xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
@@ -1376,8 +1481,7 @@ xfs_growfs_rt(
error = error2;
/* Reset the rt metadata btree space reservations. */
- xfs_rt_resv_free(mp);
- error2 = xfs_rt_resv_init(mp);
+ error2 = xfs_metafile_resv_init(mp);
if (error2 && error2 != -ENOSPC)
error = error2;
}
@@ -1407,7 +1511,7 @@ xfs_rtmount_readsb(
/* m_blkbb_log is not set up yet */
error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
- mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp,
+ mp->m_sb.sb_blocksize >> BBSHIFT, &bp,
&xfs_rtsb_buf_ops);
if (error) {
xfs_warn(mp, "rt sb validate failed with error %d.", error);
@@ -1444,10 +1548,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
- struct xfs_buf *bp; /* buffer for last block of subvolume */
- xfs_daddr_t d; /* address of last block of subvolume */
- int error;
-
if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@@ -1458,25 +1558,7 @@ xfs_rtmount_init(
mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
- /*
- * Check that the realtime section is an ok size.
- */
- d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
- xfs_warn(mp, "realtime mount -- %llu != %llu",
- (unsigned long long) XFS_BB_TO_FSB(mp, d),
- (unsigned long long) mp->m_sb.sb_rblocks);
- return -EFBIG;
- }
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
- if (error) {
- xfs_warn(mp, "realtime device size check failed");
- return error;
- }
- xfs_buf_relse(bp);
- return 0;
+ return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}
static int
@@ -1519,50 +1601,10 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}
-/* Free space reservations for rt metadata inodes. */
-void
-xfs_rt_resv_free(
- struct xfs_mount *mp)
-{
- struct xfs_rtgroup *rtg = NULL;
- unsigned int i;
-
- while ((rtg = xfs_rtgroup_next(mp, rtg))) {
- for (i = 0; i < XFS_RTGI_MAX; i++)
- xfs_metafile_resv_free(rtg->rtg_inodes[i]);
- }
-}
-
-/* Reserve space for rt metadata inodes' space expansion. */
-int
-xfs_rt_resv_init(
- struct xfs_mount *mp)
-{
- struct xfs_rtgroup *rtg = NULL;
- xfs_filblks_t ask;
- int error = 0;
-
- while ((rtg = xfs_rtgroup_next(mp, rtg))) {
- int err2;
-
- ask = xfs_rtrmapbt_calc_reserves(mp);
- err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
- if (err2 && !error)
- error = err2;
-
- ask = xfs_rtrefcountbt_calc_reserves(mp);
- err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
- if (err2 && !error)
- error = err2;
- }
-
- return error;
-}
-
/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@@ -1613,6 +1655,8 @@ xfs_rtmount_rtg(
}
}
+ if (xfs_has_zoned(mp))
+ return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}
@@ -2097,6 +2141,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
+ ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 0d95b29092c9..78a690b489ed 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -34,9 +34,6 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
-void xfs_rt_resv_free(struct xfs_mount *mp);
-int xfs_rt_resv_init(struct xfs_mount *mp);
-
/*
* Grow the realtime area of the filesystem.
*/
@@ -65,8 +62,6 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
-# define xfs_rt_resv_free(mp) ((void)0)
-# define xfs_rt_resv_init(mp) (0)
static inline int
xfs_growfs_check_rtgeom(const struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0055066fb1d9..53944cc7af24 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -109,7 +110,8 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+ Opt_lifetime, Opt_nolifetime,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ fsparam_u32("max_open_zones", Opt_max_open_zones),
+ fsparam_flag("lifetime", Opt_lifetime),
+ fsparam_flag("nolifetime", Opt_nolifetime),
{}
};
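As a hedged usage sketch (device and mount point are placeholders, not taken from this patch), the new parameters are passed like any other XFS mount option:

	# mount -o max_open_zones=64,nolifetime /dev/nvme0n1 /mnt

max_open_zones takes an unsigned 32-bit count and lifetime/nolifetime toggle XFS_FEAT_NOLIFETIME; both are rejected by xfs_finish_flags() on non-zoned file systems, as shown further down in this file.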
@@ -182,6 +187,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
+ { XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -233,6 +239,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
+ if (mp->m_max_open_zones)
+ seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+
return 0;
}
@@ -533,7 +542,15 @@ xfs_setup_devices(
if (error)
return error;
}
- if (mp->m_rtdev_targp) {
+
+ if (mp->m_sb.sb_rtstart) {
+ if (mp->m_rtdev_targp) {
+ xfs_warn(mp,
+ "can't use internal and external rtdev at the same time");
+ return -EINVAL;
+ }
+ mp->m_rtdev_targp = mp->m_ddev_targp;
+ } else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
@@ -757,7 +774,7 @@ xfs_mount_free(
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
- if (mp->m_rtdev_targp)
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
@@ -814,6 +831,7 @@ xfs_fs_sync_fs(
if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
xfs_inodegc_stop(mp);
xfs_blockgc_stop(mp);
+ xfs_zone_gc_stop(mp);
}
return 0;
@@ -834,10 +852,12 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
- percpu_counter_sum(&mp->m_fdblocks);
+ xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
/* make sure st->f_bfree does not underflow */
- st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
+ st->f_bfree = max(0LL,
+ fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
+
/*
* sb_dblocks can change during growfs, but nothing cares about reporting
* the old or new value during growfs.
@@ -856,8 +876,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
st->f_bfree = xfs_rtbxlen_to_blen(mp,
- percpu_counter_sum_positive(&mp->m_frextents));
- st->f_blocks = mp->m_sb.sb_rblocks;
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
+ mp->m_free[XC_FREE_RTEXTENTS].res_total);
}
static void
@@ -922,24 +943,32 @@ xfs_fs_statfs(
}
STATIC void
-xfs_save_resvblks(struct xfs_mount *mp)
+xfs_save_resvblks(
+ struct xfs_mount *mp)
{
- mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, 0);
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++) {
+ mp->m_free[i].res_saved = mp->m_free[i].res_total;
+ xfs_reserve_blocks(mp, i, 0);
+ }
}
STATIC void
-xfs_restore_resvblks(struct xfs_mount *mp)
+xfs_restore_resvblks(
+ struct xfs_mount *mp)
{
- uint64_t resblks;
+ uint64_t resblks;
+ enum xfs_free_counter i;
- if (mp->m_resblks_save) {
- resblks = mp->m_resblks_save;
- mp->m_resblks_save = 0;
- } else
- resblks = xfs_default_resblks(mp);
-
- xfs_reserve_blocks(mp, resblks);
+ for (i = 0; i < XC_FREE_NR; i++) {
+ if (mp->m_free[i].res_saved) {
+ resblks = mp->m_free[i].res_saved;
+ mp->m_free[i].res_saved = 0;
+ } else
+ resblks = xfs_default_resblks(mp, i);
+ xfs_reserve_blocks(mp, i, resblks);
+ }
}
/*
@@ -976,6 +1005,7 @@ xfs_fs_freeze(
if (ret && !xfs_is_readonly(mp)) {
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
+ xfs_zone_gc_start(mp);
}
return ret;
@@ -997,6 +1027,7 @@ xfs_fs_unfreeze(
* filesystem.
*/
if (!xfs_is_readonly(mp)) {
+ xfs_zone_gc_start(mp);
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
}
@@ -1058,6 +1089,19 @@ xfs_finish_flags(
return -EINVAL;
}
+ if (!xfs_has_zoned(mp)) {
+ if (mp->m_max_open_zones) {
+ xfs_warn(mp,
+"max_open_zones mount option only supported on zoned file systems.");
+ return -EINVAL;
+ }
+ if (mp->m_features & XFS_FEAT_NOLIFETIME) {
+ xfs_warn(mp,
+"nolifetime mount option only supported on zoned file systems.");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1065,7 +1109,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
- int error;
+ int error;
+ int i;
error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@@ -1075,30 +1120,29 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;
- error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
- if (error)
- goto free_ifree;
-
error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
- goto free_fdblocks;
+ goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
- error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
- if (error)
- goto free_delalloc_rt;
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = percpu_counter_init(&mp->m_free[i].count, 0,
+ GFP_KERNEL);
+ if (error)
+ goto free_freecounters;
+ }
return 0;
-free_delalloc_rt:
+free_freecounters:
+ while (--i >= 0)
+ percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
- percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
- percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
+ if (!xfs_has_zoned(mp))
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ mp->m_sb.sb_frextents);
}
static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++)
+ percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
- percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
- percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1210,6 +1258,18 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
+static int
+xfs_fs_show_stats(
+ struct seq_file *m,
+ struct dentry *root)
+{
+ struct xfs_mount *mp = XFS_M(root->d_sb);
+
+ if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+ xfs_zoned_show_stats(m, mp);
+ return 0;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
+ .show_stats = xfs_fs_show_stats,
};
static int
@@ -1436,6 +1497,15 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
+ case Opt_max_open_zones:
+ parsing_mp->m_max_open_zones = result.uint_32;
+ return 0;
+ case Opt_lifetime:
+ parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
+ return 0;
+ case Opt_nolifetime:
+ parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1780,8 +1850,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
- if (xfs_has_metadir(mp))
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_has_metadir(mp)) {
+ xfs_alert(mp,
+ "metadir feature required for zoned realtime devices.");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
+ } else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+ }
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
@@ -1793,6 +1872,13 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_zoned(mp)) {
+ xfs_alert(mp,
+ "reflink not compatible with zoned RT device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
if (xfs_globals.always_cow) {
xfs_info(mp, "using DEBUG-only always_cow mode.");
mp->m_always_cow = true;
@@ -1917,6 +2003,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
+ /* Restart zone reclaim */
+ xfs_zone_gc_start(mp);
+
return 0;
}
@@ -1961,6 +2050,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);
+ /* Stop zone reclaim */
+ xfs_zone_gc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
@@ -2082,6 +2174,7 @@ xfs_init_fs_context(
for (i = 0; i < XG_TYPE_MAX; i++)
xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
+ mutex_init(&mp->m_metafile_resv_lock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;
@@ -2122,7 +2215,8 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("xfs");
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 60cb5318fdae..b0857e3c1270 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -13,6 +13,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
+#include "xfs_zones.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);
-const struct kobj_type xfs_mp_ktype = {
+static const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@@ -701,45 +702,103 @@ out_error:
return error;
}
+static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj)
+{
+ return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj);
+}
+
+static ssize_t
+max_open_zones_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ /* only report the open zones available for user data */
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES);
+}
+XFS_SYSFS_ATTR_RO(max_open_zones);
+
+static struct attribute *xfs_zoned_attrs[] = {
+ ATTR_LIST(max_open_zones),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_zoned);
+
+static const struct kobj_type xfs_zoned_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_zoned_groups,
+};
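A hedged read-side example (the sysfs path follows from the kobject hierarchy set up in xfs_mount_sysfs_init() below; the device name is a placeholder):

	$ cat /sys/fs/xfs/nvme0n1/zoned/max_open_zones

This prints m_max_open_zones minus the XFS_OPEN_GC_ZONES held back for garbage collection, i.e. only the open zones available for user data.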
+
int
-xfs_error_sysfs_init(
+xfs_mount_sysfs_init(
struct xfs_mount *mp)
{
int error;
+ super_set_sysfs_name_id(mp->m_super);
+
+ /* .../xfs/<dev>/ */
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+ NULL, mp->m_super->s_id);
+ if (error)
+ return error;
+
+ /* .../xfs/<dev>/stats/ */
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
+ if (error)
+ goto out_remove_fsdir;
+
/* .../xfs/<dev>/error/ */
error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
&mp->m_kobj, "error");
if (error)
- return error;
+ goto out_remove_stats_dir;
+ /* .../xfs/<dev>/error/fail_at_unmount */
error = sysfs_create_file(&mp->m_error_kobj.kobject,
ATTR_LIST(fail_at_unmount));
if (error)
- goto out_error;
+ goto out_remove_error_dir;
/* .../xfs/<dev>/error/metadata/ */
error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
"metadata", &mp->m_error_meta_kobj,
xfs_error_meta_init);
if (error)
- goto out_error;
+ goto out_remove_error_dir;
+
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) {
+ /* .../xfs/<dev>/zoned/ */
+ error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype,
+ &mp->m_kobj, "zoned");
+ if (error)
+ goto out_remove_error_dir;
+ }
return 0;
-out_error:
+out_remove_error_dir:
xfs_sysfs_del(&mp->m_error_kobj);
+out_remove_stats_dir:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+out_remove_fsdir:
+ xfs_sysfs_del(&mp->m_kobj);
return error;
}
void
-xfs_error_sysfs_del(
+xfs_mount_sysfs_del(
struct xfs_mount *mp)
{
struct xfs_error_cfg *cfg;
int i, j;
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ xfs_sysfs_del(&mp->m_zoned_kobj);
+
for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
cfg = &mp->m_error_cfg[i][j];
@@ -749,6 +808,8 @@ xfs_error_sysfs_del(
}
xfs_sysfs_del(&mp->m_error_meta_kobj);
xfs_sysfs_del(&mp->m_error_kobj);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+ xfs_sysfs_del(&mp->m_kobj);
}
struct xfs_error_cfg *
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 148893ebfdef..1622fe80ad3e 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -7,7 +7,6 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__
-extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern const struct kobj_type xfs_dbg_ktype; /* debug */
extern const struct kobj_type xfs_log_ktype; /* xlog */
extern const struct kobj_type xfs_stats_ktype; /* stats */
@@ -53,7 +52,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
-int xfs_error_sysfs_init(struct xfs_mount *mp);
-void xfs_error_sysfs_del(struct xfs_mount *mp);
+int xfs_mount_sysfs_init(struct xfs_mount *mp);
+void xfs_mount_sysfs_del(struct xfs_mount *mp);
#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8f530e69c18a..a60556dbd172 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -49,6 +49,8 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bfc2f1249022..e56ba1963160 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -102,6 +102,7 @@ struct xfs_rmap_intent;
struct xfs_refcount_intent;
struct xfs_metadir_update;
struct xfs_rtgroup;
+struct xfs_open_zone;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+ TP_PROTO(struct xfs_rtgroup *rtg),
+ TP_ARGS(rtg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(unsigned int, nr_open)
+ ),
+ TP_fast_assign(
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->nr_open = mp->m_zone_info->zi_nr_open_zones;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->nr_open)
+);
+
+#define DEFINE_ZONE_EVENT(name) \
+DEFINE_EVENT(xfs_zone_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg), \
+ TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_opened);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
+
+TRACE_EVENT(xfs_zone_free_blocks,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(rtg, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->rgbno,
+ __entry->len)
+);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(oz, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, written)
+ __field(xfs_rgblock_t, write_pointer)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(oz->oz_rtg);
+ __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+ __entry->written = oz->oz_written;
+ __entry->write_pointer = oz->oz_write_pointer;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->written,
+ __entry->write_pointer,
+ __entry->rgbno,
+ __entry->len)
+);
+
+#define DEFINE_ZONE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_zone_alloc_class, name, \
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(oz, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+
+TRACE_EVENT(xfs_zone_gc_select_victim,
+ TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
+ TP_ARGS(rtg, bucket),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(unsigned int, bucket)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->bucket = bucket;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->bucket)
+);
+
+TRACE_EVENT(xfs_zones_mount,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgcount)
+ __field(uint32_t, blocks)
+ __field(unsigned int, max_open_zones)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgcount = mp->m_sb.sb_rgcount;
+ __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ __entry->max_open_zones = mp->m_max_open_zones;
+ ),
+ TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgcount,
+ __entry->blocks,
+ __entry->max_open_zones)
+);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@@ -545,6 +692,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+DEFINE_BUF_EVENT(xfs_buf_backing_folio);
+DEFINE_BUF_EVENT(xfs_buf_backing_kmem);
+DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc);
+DEFINE_BUF_EVENT(xfs_buf_backing_fallback);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
@@ -1596,6 +1747,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -3983,6 +4135,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -5606,11 +5759,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup);
/* metadata inode space reservations */
DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
- TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
- TP_ARGS(ip, len),
+ TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len),
+ TP_ARGS(mp, len),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_ino_t, ino)
__field(unsigned long long, freeblks)
__field(unsigned long long, reserved)
__field(unsigned long long, asked)
@@ -5618,19 +5770,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
__field(unsigned long long, len)
),
TP_fast_assign(
- struct xfs_mount *mp = ip->i_mount;
-
__entry->dev = mp->m_super->s_dev;
- __entry->ino = ip->i_ino;
- __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
- __entry->reserved = ip->i_delayed_blks;
- __entry->asked = ip->i_meta_resv_asked;
- __entry->used = ip->i_nblocks;
+ __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ __entry->reserved = mp->m_metafile_resv_avail;
+ __entry->asked = mp->m_metafile_resv_target;
+ __entry->used = mp->m_metafile_resv_used;
__entry->len = len;
),
- TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
+ TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
__entry->freeblks,
__entry->reserved,
__entry->asked,
@@ -5639,14 +5787,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
)
#define DEFINE_METAFILE_RESV_EVENT(name) \
DEFINE_EVENT(xfs_metafile_resv_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
- TP_ARGS(ip, len))
+ TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \
+ TP_ARGS(mp, len))
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
-DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error);
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xfs_growfs_check_rtgeom,
@@ -5669,6 +5817,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
);
#endif /* CONFIG_XFS_RT */
+TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
+TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
+
+DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
+ TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
+ uint64_t delta, unsigned long caller_ip),
+ TP_ARGS(mp, ctr, delta, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_free_counter, ctr)
+ __field(uint64_t, delta)
+ __field(uint64_t, avail)
+ __field(uint64_t, total)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ctr = ctr;
+ __entry->delta = delta;
+ __entry->avail = mp->m_free[ctr].res_avail;
+ __entry->total = mp->m_free[ctr].res_total;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR),
+ __entry->delta,
+ __entry->avail,
+ __entry->total,
+ (char *)__entry->caller_ip)
+)
+#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \
+ uint64_t delta, unsigned long caller_ip), \
+ TP_ARGS(mp, ctr, delta, caller_ip))
+DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
+DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644
index 000000000000..52af234936a2
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -0,0 +1,1220 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_error.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_refcount.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+void
+xfs_open_zone_put(
+ struct xfs_open_zone *oz)
+{
+ if (atomic_dec_and_test(&oz->oz_ref)) {
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+ }
+}
+
+static inline uint32_t
+xfs_zone_bucket(
+ struct xfs_mount *mp,
+ uint32_t used_blocks)
+{
+ return XFS_ZONE_USED_BUCKETS * used_blocks /
+ mp->m_groups[XG_TYPE_RTG].blocks;
+}
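A quick worked example of the bucket index above; the constants are illustrative assumptions, not values taken from this patch:

	/*
	 * Assuming XFS_ZONE_USED_BUCKETS == 16 and 65536 blocks per zone:
	 *   used = 20000  ->  16 * 20000 / 65536 = 4    (middle bucket)
	 *   used =   100  ->  16 *   100 / 65536 = 0    (almost empty)
	 *   used = 61440  ->  16 * 61440 / 65536 = 15   (mostly live data)
	 */

Lower-numbered buckets therefore hold the zones with the fewest live blocks, i.e. the cheapest candidates for reclaim.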
+
+static inline void
+xfs_zone_add_to_bucket(
+ struct xfs_zone_info *zi,
+ xfs_rgnumber_t rgno,
+ uint32_t to_bucket)
+{
+ __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
+ zi->zi_used_bucket_entries[to_bucket]++;
+}
+
+static inline void
+xfs_zone_remove_from_bucket(
+ struct xfs_zone_info *zi,
+ xfs_rgnumber_t rgno,
+ uint32_t from_bucket)
+{
+ __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
+ zi->zi_used_bucket_entries[from_bucket]--;
+}
+
+static void
+xfs_zone_account_reclaimable(
+ struct xfs_rtgroup *rtg,
+ uint32_t freed)
+{
+ struct xfs_group *xg = &rtg->rtg_group;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
+ xfs_rgnumber_t rgno = rtg_rgno(rtg);
+ uint32_t from_bucket = xfs_zone_bucket(mp, used + freed);
+ uint32_t to_bucket = xfs_zone_bucket(mp, used);
+ bool was_full = (used + freed == rtg_blocks(rtg));
+
+ /*
+ * This can be called from log recovery, where the zone_info structure
+ * hasn't been allocated yet. Skip all work as xfs_mount_zones will
+ * add the zones to the right buckets before the file system becomes
+ * active.
+ */
+ if (!zi)
+ return;
+
+ if (!used) {
+ /*
+ * The zone is now empty, remove it from the bottom bucket and
+ * trigger a reset.
+ */
+ trace_xfs_zone_emptied(rtg);
+
+ if (!was_full)
+ xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ if (!was_full)
+ xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ spin_lock(&zi->zi_reset_list_lock);
+ xg->xg_next_reset = zi->zi_reset_list;
+ zi->zi_reset_list = xg;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (zi->zi_gc_thread)
+ wake_up_process(zi->zi_gc_thread);
+ } else if (was_full) {
+ /*
+ * The zone transitioned away from full, so mark it as reclaimable
+ * and wake up GC, which might be waiting for zones to reclaim.
+ */
+ spin_lock(&zi->zi_used_buckets_lock);
+ xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
+ if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
+ wake_up_process(zi->zi_gc_thread);
+ } else if (to_bucket != from_bucket) {
+ /*
+ * Move the zone to a new bucket if it dropped below the
+ * threshold.
+ */
+ spin_lock(&zi->zi_used_buckets_lock);
+ xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+ xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+ }
+}
+
+static void
+xfs_open_zone_mark_full(
+ struct xfs_open_zone *oz)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
+
+ trace_xfs_zone_full(rtg);
+
+ WRITE_ONCE(rtg->rtg_open_zone, NULL);
+
+ spin_lock(&zi->zi_open_zones_lock);
+ if (oz->oz_is_gc) {
+ ASSERT(current == zi->zi_gc_thread);
+ zi->zi_open_gc_zone = NULL;
+ } else {
+ zi->zi_nr_open_zones--;
+ list_del_init(&oz->oz_entry);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+ xfs_open_zone_put(oz);
+
+ wake_up_all(&zi->zi_zone_wait);
+ if (used < rtg_blocks(rtg))
+ xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
+}
+
+static void
+xfs_zone_record_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len,
+ struct xfs_open_zone *oz,
+ bool used)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
+
+ trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ if (used) {
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
+ } else {
+ xfs_add_frextents(mp, len);
+ }
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+}
+
+static int
+xfs_zoned_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_bmbt_irec data;
+ int nmaps = 1;
+ int error;
+
+ /* Grab the corresponding mapping in the data fork. */
+ error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+ &nmaps, 0);
+ if (error)
+ return error;
+
+ /*
+ * Cap the update to the existing extent in the data fork because we can
+ * only overwrite one extent at a time.
+ */
+ ASSERT(new->br_blockcount >= data.br_blockcount);
+ new->br_blockcount = data.br_blockcount;
+
+ /*
+ * If a data write raced with this GC write, keep the existing data in
+ * the data fork, mark our newly written GC extent as reclaimable, then
+ * move on to the next extent.
+ */
+ if (old_startblock != NULLFSBLOCK &&
+ old_startblock != data.br_startblock)
+ goto skip;
+
+ trace_xfs_reflink_cow_remap_from(ip, new);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ return error;
+
+ if (data.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+ ASSERT(!isnullstartblock(data.br_startblock));
+
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+ if (xfs_is_reflink_inode(ip)) {
+ xfs_refcount_decrease_extent(tp, true, &data);
+ } else {
+ error = xfs_free_extent_later(tp, data.br_startblock,
+ data.br_blockcount, NULL,
+ XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_REALTIME);
+ if (error)
+ return error;
+ }
+ }
+
+ xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
+ true);
+
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+ return 0;
+
+skip:
+ trace_xfs_reflink_cow_remap_skip(ip, new);
+ xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
+ false);
+ return 0;
+}
+
+int
+xfs_zoned_end_io(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ xfs_daddr_t daddr,
+ struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ struct xfs_bmbt_irec new = {
+ .br_startoff = XFS_B_TO_FSBT(mp, offset),
+ .br_startblock = xfs_daddr_to_rtb(mp, daddr),
+ .br_state = XFS_EXT_NORM,
+ };
+ unsigned int resblks =
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ struct xfs_trans *tp;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ while (new.br_startoff < end_fsb) {
+ new.br_blockcount = end_fsb - new.br_startoff;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ new.br_startoff += new.br_blockcount;
+ new.br_startblock += new.br_blockcount;
+ if (old_startblock != NULLFSBLOCK)
+ old_startblock += new.br_blockcount;
+ }
+
+ return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
+
+ xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
+
+ if (len > rmapip->i_used_blocks) {
+ xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%u).",
+ len, rmapip->i_used_blocks);
+ ASSERT(len <= rmapip->i_used_blocks);
+ xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ rmapip->i_used_blocks -= len;
+ /*
+ * Don't add open zones to the reclaimable buckets. The I/O completion
+ * for writing the last block will take care of accounting for already
+ * unused blocks instead.
+ */
+ if (!READ_ONCE(rtg->rtg_open_zone))
+ xfs_zone_account_reclaimable(rtg, len);
+ xfs_add_frextents(mp, len);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_open_zone *
+xfs_last_used_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_open_zone *oz = NULL;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+ &icur, &got)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return NULL;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+ if (!rtg)
+ return NULL;
+
+ xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+ oz = READ_ONCE(rtg->rtg_open_zone);
+ if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
+ oz = NULL;
+ xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+
+ xfs_rtgroup_rele(rtg);
+ return oz;
+}
+
+static struct xfs_group *
+xfs_find_free_zone(
+ struct xfs_mount *mp,
+ unsigned long start,
+ unsigned long end)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
+ struct xfs_group *xg;
+
+ xas_lock(&xas);
+ xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
+ if (atomic_inc_not_zero(&xg->xg_active_ref))
+ goto found;
+ xas_unlock(&xas);
+ return NULL;
+
+found:
+ xas_clear_mark(&xas, XFS_RTG_FREE);
+ atomic_dec(&zi->zi_nr_free_zones);
+ zi->zi_free_zone_cursor = xg->xg_gno;
+ xas_unlock(&xas);
+ return xg;
+}
+
+static struct xfs_open_zone *
+xfs_init_open_zone(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t write_pointer,
+ enum rw_hint write_hint,
+ bool is_gc)
+{
+ struct xfs_open_zone *oz;
+
+ oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
+ spin_lock_init(&oz->oz_alloc_lock);
+ atomic_set(&oz->oz_ref, 1);
+ oz->oz_rtg = rtg;
+ oz->oz_write_pointer = write_pointer;
+ oz->oz_written = write_pointer;
+ oz->oz_write_hint = write_hint;
+ oz->oz_is_gc = is_gc;
+
+ /*
+ * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
+ * inode, but we don't really want to take that here because we are
+ * under the zone_list_lock. Ensure the pointer is only set for a fully
+ * initialized open zone structure so that a racy lookup finding it is
+ * fine.
+ */
+ WRITE_ONCE(rtg->rtg_open_zone, oz);
+ return oz;
+}
+
+/*
+ * Find a completely free zone, open it, and return a reference.
+ */
+struct xfs_open_zone *
+xfs_open_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool is_gc)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_group *xg;
+
+ xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
+ if (!xg)
+ xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
+ if (!xg)
+ return NULL;
+
+ set_current_state(TASK_RUNNING);
+ return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
+}
+
+static struct xfs_open_zone *
+xfs_try_open_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
+ return NULL;
+ if (atomic_read(&zi->zi_nr_free_zones) <
+ XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+ return NULL;
+
+ /*
+ * Increment the open zone count to reserve our slot before dropping
+ * zi_open_zones_lock.
+ */
+ zi->zi_nr_open_zones++;
+ spin_unlock(&zi->zi_open_zones_lock);
+ oz = xfs_open_zone(mp, write_hint, false);
+ spin_lock(&zi->zi_open_zones_lock);
+ if (!oz) {
+ zi->zi_nr_open_zones--;
+ return NULL;
+ }
+
+ atomic_inc(&oz->oz_ref);
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+
+ /*
+ * If this was the last free zone, other waiters might be waiting
+ * on us to write to it as well.
+ */
+ wake_up_all(&zi->zi_zone_wait);
+
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(zi->zi_gc_thread);
+
+ trace_xfs_zone_opened(oz->oz_rtg);
+ return oz;
+}
+
+/*
+ * For data with a short or medium lifetime, try to colocate it into an
+ * already open zone with a matching temperature.
+ */
+static bool
+xfs_colocate_eagerly(
+ enum rw_hint file_hint)
+{
+ switch (file_hint) {
+ case WRITE_LIFE_MEDIUM:
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+xfs_good_hint_match(
+ struct xfs_open_zone *oz,
+ enum rw_hint file_hint)
+{
+ switch (oz->oz_write_hint) {
+ case WRITE_LIFE_LONG:
+ case WRITE_LIFE_EXTREME:
+ /* colocate long and extreme */
+ if (file_hint == WRITE_LIFE_LONG ||
+ file_hint == WRITE_LIFE_EXTREME)
+ return true;
+ break;
+ case WRITE_LIFE_MEDIUM:
+ /* colocate medium with medium */
+ if (file_hint == WRITE_LIFE_MEDIUM)
+ return true;
+ break;
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ case WRITE_LIFE_NOT_SET:
+ /* colocate short and none */
+ if (file_hint <= WRITE_LIFE_SHORT)
+ return true;
+ break;
+ }
+ return false;
+}
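The switch above can be summarized as a small compatibility table (a restatement for readability, not additional policy):

	/*
	 *   open zone hint           file hints it accepts
	 *   LONG, EXTREME            LONG, EXTREME
	 *   MEDIUM                   MEDIUM
	 *   SHORT, NONE, NOT_SET     NOT_SET, NONE, SHORT
	 */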
+
+static bool
+xfs_try_use_zone(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint,
+ struct xfs_open_zone *oz,
+ bool lowspace)
+{
+ if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ return false;
+ if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+ return false;
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ return false;
+
+ /*
+ * If we have a hint set for the data, use that for the zone even if
+ * some data was written already without any hint set, but don't change
+ * the temperature after that as that would make little sense without
+ * tracking per-temperature class written block counts, which is
+ * probably overkill anyway.
+ */
+ if (file_hint != WRITE_LIFE_NOT_SET &&
+ oz->oz_write_hint == WRITE_LIFE_NOT_SET)
+ oz->oz_write_hint = file_hint;
+
+ /*
+ * If we couldn't match by inode or lifetime, we just pick the first
+ * zone with enough space above. For that we want the least busy zone
+ * for some definition of "least" busy. For now this simple LRU
+ * algorithm that rotates every zone to the end of the list will do it,
+ * even if it isn't exactly cache friendly.
+ */
+ if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
+ list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
+ return true;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_lru(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint,
+ bool lowspace)
+{
+ struct xfs_open_zone *oz;
+
+ lockdep_assert_held(&zi->zi_open_zones_lock);
+
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ return oz;
+
+ cond_resched_lock(&zi->zi_open_zones_lock);
+ return NULL;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_mru(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint)
+{
+ struct xfs_open_zone *oz;
+
+ lockdep_assert_held(&zi->zi_open_zones_lock);
+
+ list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, file_hint, oz, false))
+ return oz;
+
+ cond_resched_lock(&zi->zi_open_zones_lock);
+ return NULL;
+}
+
+static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
+{
+ if (xfs_has_nolifetime(ip->i_mount))
+ return WRITE_LIFE_NOT_SET;
+ return VFS_I(ip)->i_write_hint;
+}
+
+/*
+ * Try to tightly pack inodes that are written back after they were closed,
+ * instead of opening new zones for them or spreading them to the least recently
+ * used zone. This optimizes the data layout for workloads that untar or copy
+ * a lot of small files. Right now this does not separate multiple such
+ * streams.
+ */
+static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
+{
+ return !inode_is_open_for_write(VFS_I(ip)) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND);
+}
+
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from the
+ * freelist. Else try to find one that matches the expected data lifetime. If
+ * we don't find one that is good pick any zone that is available.
+ */
+static struct xfs_open_zone *
+xfs_select_zone_nowait(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool pack_tight)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = NULL;
+
+ if (xfs_is_shutdown(mp))
+ return NULL;
+
+ /*
+ * Try to fill up open zones with matching temperature if available. It
+ * is better to try to co-locate data when this is favorable, so we can
+ * activate empty zones when it is statistically better to separate
+ * data.
+ */
+ spin_lock(&zi->zi_open_zones_lock);
+ if (xfs_colocate_eagerly(write_hint))
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ else if (pack_tight)
+ oz = xfs_select_open_zone_mru(zi, write_hint);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * See if we can open a new zone and use that.
+ */
+ oz = xfs_try_open_zone(mp, write_hint);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Try to colocate cold data with other cold data if we failed to open a
+ * new zone for it.
+ */
+ if (write_hint != WRITE_LIFE_NOT_SET &&
+ !xfs_colocate_eagerly(write_hint))
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ if (!oz)
+ oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
+ if (!oz)
+ oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+out_unlock:
+ spin_unlock(&zi->zi_open_zones_lock);
+ return oz;
+}
+
+static struct xfs_open_zone *
+xfs_select_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool pack_tight)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ DEFINE_WAIT (wait);
+ struct xfs_open_zone *oz;
+
+ oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
+ if (oz)
+ return oz;
+
+ for (;;) {
+ prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+ oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
+ if (oz)
+ break;
+ schedule();
+ }
+ finish_wait(&zi->zi_zone_wait, &wait);
+ return oz;
+}
+
+static unsigned int
+xfs_zone_alloc_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t count_fsb,
+ sector_t *sector,
+ bool *is_seq)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rgblock_t rgbno;
+
+ spin_lock(&oz->oz_alloc_lock);
+ count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+ if (!count_fsb) {
+ spin_unlock(&oz->oz_alloc_lock);
+ return 0;
+ }
+ rgbno = oz->oz_write_pointer;
+ oz->oz_write_pointer += count_fsb;
+ spin_unlock(&oz->oz_alloc_lock);
+
+ trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+
+ *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
+ if (!*is_seq)
+ *sector += XFS_FSB_TO_BB(mp, rgbno);
+ return XFS_FSB_TO_B(mp, count_fsb);
+}
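For context, a hedged summary of the contract this helper exposes to the writeback path below: it may grant fewer blocks than requested (capped at XFS_MAX_BMBT_EXTLEN and at the space left before the write pointer reaches the end of the zone), and it returns 0 bytes once the zone is exhausted, which is the signal for xfs_zone_alloc_and_submit() to drop its zone reference and select a new zone.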
+
+void
+xfs_mark_rtg_boundary(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ sector_t sector = ioend->io_bio.bi_iter.bi_sector;
+
+ if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+ ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+}
+
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone *oz,
+ bool is_seq)
+{
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ ioend->io_private = oz;
+ atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
+
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
+ }
+
+ submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ enum rw_hint write_hint = xfs_inode_write_hint(ip);
+ bool pack_tight = xfs_zoned_pack_tight(ip);
+ unsigned int alloc_len;
+ struct iomap_ioend *split;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ goto out_error;
+
+ /*
+ * If we don't have a cached zone in this write context, see if the
+ * last extent before the one we are writing to points to an active
+ * zone. If so, just continue writing to it.
+ */
+ if (!*oz && ioend->io_offset)
+ *oz = xfs_last_used_zone(ioend);
+ if (!*oz) {
+select_zone:
+ *oz = xfs_select_zone(mp, write_hint, pack_tight);
+ if (!*oz)
+ goto out_error;
+ }
+
+ alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
+ &ioend->io_sector, &is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+
+ while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
+ if (IS_ERR(split))
+ goto out_split_error;
+ alloc_len -= split->io_bio.bi_iter.bi_size;
+ xfs_submit_zoned_bio(split, *oz, is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+ }
+
+ xfs_submit_zoned_bio(ioend, *oz, is_seq);
+ return;
+
+out_split_error:
+ ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
+out_error:
+ bio_io_error(&ioend->io_bio);
+}
+
+/*
+ * Wake up all threads waiting for a zoned space allocation when the file system
+ * is shut down.
+ */
+void
+xfs_zoned_wake_all(
+ struct xfs_mount *mp)
+{
+ /*
+ * Don't wake up if there is no m_zone_info. This is complicated by the
+ * fact that unmount can't atomically clear m_zone_info and thus we need
+ * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE
+ * during log recovery so we can't entirely rely on that either.
+ */
+ if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
+ wake_up_all(&mp->m_zone_info->zi_zone_wait);
+}
+
+/*
+ * Check if @rgbno in @rtg is a potentially valid block. It might still be
+ * unused, but that information is only found in the rmap.
+ */
+bool
+xfs_zone_rgbno_is_valid(
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgbno)
+{
+ lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
+
+ if (rtg->rtg_open_zone)
+ return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+ return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
+ rtg_rgno(rtg), XFS_RTG_FREE);
+}
+
+static void
+xfs_free_open_zones(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
+ struct xfs_open_zone, oz_entry))) {
+ list_del(&oz->oz_entry);
+ xfs_open_zone_put(oz);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+}
+
+struct xfs_init_zones {
+ struct xfs_mount *mp;
+ uint64_t available;
+ uint64_t reclaimable;
+};
+
+static int
+xfs_init_zone(
+ struct xfs_init_zones *iz,
+ struct xfs_rtgroup *rtg,
+ struct blk_zone *zone)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint64_t used = rtg_rmap(rtg)->i_used_blocks;
+ xfs_rgblock_t write_pointer, highest_rgbno;
+ int error;
+
+ if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
+ return -EFSCORRUPTED;
+
+ /*
+ * For sequential write required zones we retrieved the hardware write
+ * pointer above.
+ *
+ * For conventional zones or conventional devices we don't have that
+ * luxury. Instead query the rmap to find the highest recorded block
+ * and set the write pointer to the block after that. In case of a
+ * power loss this misses blocks where the data I/O has completed but
+ * not recorded in the rmap yet, and it also rewrites blocks if the most
+ * recently written ones got deleted again before unmount, but this is
+ * the best we can do without hardware support.
+ */
+ if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
+ if (highest_rgbno == NULLRGBLOCK)
+ write_pointer = 0;
+ else
+ write_pointer = highest_rgbno + 1;
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ }
+
+ /*
+ * If there are no used blocks, but the zone is not in the empty state yet,
+ * we lost power before the zone reset. In that case finish the work
+ * here.
+ */
+ if (write_pointer == rtg_blocks(rtg) && used == 0) {
+ error = xfs_zone_gc_reset_sync(rtg);
+ if (error)
+ return error;
+ write_pointer = 0;
+ }
+
+ if (write_pointer == 0) {
+ /* zone is empty */
+ atomic_inc(&zi->zi_nr_free_zones);
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ iz->available += rtg_blocks(rtg);
+ } else if (write_pointer < rtg_blocks(rtg)) {
+ /* zone is open */
+ struct xfs_open_zone *oz;
+
+ atomic_inc(&rtg_group(rtg)->xg_active_ref);
+ oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
+ false);
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+ zi->zi_nr_open_zones++;
+
+ iz->available += (rtg_blocks(rtg) - write_pointer);
+ iz->reclaimable += write_pointer - used;
+ } else if (used < rtg_blocks(rtg)) {
+ /* zone fully written, but has freed blocks */
+ xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
+ iz->reclaimable += (rtg_blocks(rtg) - used);
+ }
+
+ return 0;
+}
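A worked example of the classification above (block counts are illustrative; assume a 65536-block zone):

	/*
	 *   write_pointer = 0                    -> free zone, zi_nr_free_zones++
	 *   write_pointer = 40000, used = 30000  -> reopened: 25536 blocks still
	 *                                           writable, 10000 reclaimable
	 *   write_pointer = 65536, used = 50000  -> fully written: 15536 blocks
	 *                                           reclaimable by GC
	 *   write_pointer = 65536, used = 0      -> reset finished here, then
	 *                                           counted as free
	 */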
+
+static int
+xfs_get_zone_info_cb(
+ struct blk_zone *zone,
+ unsigned int idx,
+ void *data)
+{
+ struct xfs_init_zones *iz = data;
+ struct xfs_mount *mp = iz->mp;
+ xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start);
+ xfs_rgnumber_t rgno;
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+ xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+ return -EFSCORRUPTED;
+ }
+
+ rgno = xfs_rtb_to_rgno(mp, zsbno);
+ rtg = xfs_rtgroup_grab(mp, rgno);
+ if (!rtg) {
+ xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+ return -EFSCORRUPTED;
+ }
+ error = xfs_init_zone(iz, rtg, zone);
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
+/*
+ * Calculate the max open zone limit based on the number of
+ * backing zones available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+ struct xfs_mount *mp)
+{
+ unsigned int max_open, max_open_data_zones;
+ /*
+ * We need two zones for every open data zone,
+ * one in reserve as we don't reclaim open zones. One data zone
+ * and its spare are included in XFS_MIN_ZONES.
+ */
+ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+ max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+ /*
+ * Cap the max open limit to 1/4 of available space
+ */
+ max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+ return max(XFS_MIN_OPEN_ZONES, max_open);
+}
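A hedged worked example of the sizing above (the XFS_* constants here are illustrative assumptions, not values from this patch): with sb_rgcount = 200, XFS_MIN_ZONES = 32 and XFS_OPEN_GC_ZONES = 1, max_open_data_zones = (200 - 32) / 2 + 1 = 85, so max_open = 86; the quarter-of-space cap then reduces that to min(86, 200 / 4) = 50, and the final result is max(XFS_MIN_OPEN_ZONES, 50).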
+
+/*
+ * Normally we use the open zone limit that the device reports. If there is
+ * none let the user pick one from the command line.
+ *
+ * If the device doesn't report an open zone limit and there is no override,
+ * allow about a quarter of the zones to be held open. In theory we could allow
+ * all to be open, but at that point we run into GC deadlocks because we can't
+ * reclaim open zones.
+ *
+ * When used on conventional SSDs a lower open limit is advisable as we'll
+ * otherwise overwhelm the FTL just as much as a conventional block allocator.
+ *
+ * Note: To debug the open zone management code, force max_open to 1 here.
+ */
+static int
+xfs_calc_open_zones(
+ struct xfs_mount *mp)
+{
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ unsigned int bdev_open_zones = bdev_max_open_zones(bdev);
+
+ if (!mp->m_max_open_zones) {
+ if (bdev_open_zones)
+ mp->m_max_open_zones = bdev_open_zones;
+ else
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ }
+
+ if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+ xfs_notice(mp, "need at least %u open zones.",
+ XFS_MIN_OPEN_ZONES);
+ return -EIO;
+ }
+
+ if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+ mp->m_max_open_zones = bdev_open_zones;
+ xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
+ bdev_open_zones);
+ }
+
+ if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+ mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+ }
+
+ return 0;
+}
+
+static unsigned long *
+xfs_alloc_bucket_bitmap(
+ struct xfs_mount *mp)
+{
+ return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
+ sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
+}
+
+static struct xfs_zone_info *
+xfs_alloc_zone_info(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi;
+ int i;
+
+ zi = kzalloc(sizeof(*zi), GFP_KERNEL);
+ if (!zi)
+ return NULL;
+ INIT_LIST_HEAD(&zi->zi_open_zones);
+ INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
+ spin_lock_init(&zi->zi_reset_list_lock);
+ spin_lock_init(&zi->zi_open_zones_lock);
+ spin_lock_init(&zi->zi_reservation_lock);
+ init_waitqueue_head(&zi->zi_zone_wait);
+ spin_lock_init(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
+ if (!zi->zi_used_bucket_bitmap[i])
+ goto out_free_bitmaps;
+ }
+ return zi;
+
+out_free_bitmaps:
+ while (--i >= 0)
+ kvfree(zi->zi_used_bucket_bitmap[i]);
+ kfree(zi);
+ return NULL;
+}
+
+static void
+xfs_free_zone_info(
+ struct xfs_zone_info *zi)
+{
+ int i;
+
+ xfs_free_open_zones(zi);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
+ kvfree(zi->zi_used_bucket_bitmap[i]);
+ kfree(zi);
+}
+
+int
+xfs_mount_zones(
+ struct xfs_mount *mp)
+{
+ struct xfs_init_zones iz = {
+ .mp = mp,
+ };
+ struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ int error;
+
+ if (!bt) {
+ xfs_notice(mp, "RT device missing.");
+ return -EINVAL;
+ }
+
+ if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+ xfs_notice(mp, "invalid flag combination.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rextsize != 1) {
+ xfs_notice(mp, "zoned file systems do not support rextsize.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+ xfs_notice(mp,
+"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_calc_open_zones(mp);
+ if (error)
+ return error;
+
+ mp->m_zone_info = xfs_alloc_zone_info(mp);
+ if (!mp->m_zone_info)
+ return -ENOMEM;
+
+ xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+ mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+ mp->m_max_open_zones);
+ trace_xfs_zones_mount(mp);
+
+ if (bdev_is_zoned(bt->bt_bdev)) {
+ error = blkdev_report_zones(bt->bt_bdev,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
+ mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
+ if (error < 0)
+ goto out_free_zone_info;
+ } else {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_init_zone(&iz, rtg, NULL);
+ if (error)
+ goto out_free_zone_info;
+ }
+ }
+
+ xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ iz.available + iz.reclaimable);
+
+ error = xfs_zone_gc_mount(mp);
+ if (error)
+ goto out_free_zone_info;
+ return 0;
+
+out_free_zone_info:
+ xfs_free_zone_info(mp->m_zone_info);
+ return error;
+}
+
+void
+xfs_unmount_zones(
+ struct xfs_mount *mp)
+{
+ xfs_zone_gc_unmount(mp);
+ xfs_free_zone_info(mp->m_zone_info);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644
index 000000000000..ecf39106704c
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+struct iomap_ioend;
+struct xfs_open_zone;
+
+struct xfs_zone_alloc_ctx {
+ struct xfs_open_zone *open_zone;
+ xfs_filblks_t reserved_blocks;
+};
+
+/*
+ * Grab any available space, even if it is less than what the caller asked for.
+ */
+#define XFS_ZR_GREEDY (1U << 0)
+/*
+ * Only grab instantly available space, don't wait or GC.
+ */
+#define XFS_ZR_NOWAIT (1U << 1)
+/*
+ * Dip into the reserved pool.
+ */
+#define XFS_ZR_RESERVED (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+ unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
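+
+/*
+ * Example usage (a sketch only, not taken from any caller in this patch):
+ *
+ *	struct xfs_zone_alloc_ctx ac = { };
+ *	int error;
+ *
+ *	error = xfs_zoned_space_reserve(ip, count_fsb, XFS_ZR_GREEDY, &ac);
+ *	if (error)
+ *		return error;
+ *	...write using the up to ac.reserved_blocks reserved blocks...
+ *	xfs_zoned_space_unreserve(ip, &ac);
+ *
+ * XFS_ZR_GREEDY allows the reservation to be trimmed to whatever space is
+ * left, so short writes can still make progress near ENOSPC.
+ */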
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno, xfs_filblks_t len);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+ xfs_daddr_t daddr, struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock);
+void xfs_open_zone_put(struct xfs_open_zone *oz);
+
+void xfs_zoned_wake_all(struct xfs_mount *mp);
+bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
+void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
+
+#ifdef CONFIG_XFS_RT
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
+#else
+static inline int xfs_mount_zones(struct xfs_mount *mp)
+{
+ return -EIO;
+}
+static inline void xfs_unmount_zones(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644
index 000000000000..c5136ea9bb1d
--- /dev/null
+++ b/fs/xfs/xfs_zone_gc.c
@@ -0,0 +1,1165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Implement Garbage Collection (GC) of partially used zones.
+ *
+ * To support the purely sequential writes in each zone, zoned XFS needs to be
+ * able to move data remaining in a zone out of it to reset the zone to prepare
+ * for writing to it again.
+ *
+ * This is done by the GC thread implemented in this file. To support that a
+ * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
+ * write the garbage collected data into.
+ *
+ * Whenever the available space is below the chosen threshold, the GC thread
+ * looks for potential non-empty but not fully used zones that are worth
+ * reclaiming. Once found, the rmap for the victim zone is queried, and after
+ * a bit of sorting to reduce fragmentation, the still live extents are read
+ * into memory and written to the GC target zone, and the bmap btree of the
+ * files is updated to point to the new location. To avoid taking the IOLOCK
+ * and MMAPLOCK for the entire GC process and thus affecting the latency of
+ * user reads and writes to the files, the GC writes are speculative and the
+ * I/O completion checks that no other writes happened for the affected regions
+ * before remapping.
+ *
+ * Once a zone does not contain any valid data, be that through GC or user
+ * block removal, it is queued for a zone reset. The reset operation
+ * carefully ensures that the RT device cache is flushed and all transactions
+ * referencing the rmap have been committed to disk.
+ */
+
+/*
+ * Size of each GC scratch pad. This is also the upper bound for each
+ * GC I/O, which helps to keep latency down.
+ */
+#define XFS_GC_CHUNK_SIZE SZ_1M
+
+/*
+ * Scratchpad data to read GCed data into.
+ *
+ * The offset member tracks where the next allocation starts, and freed tracks
+ * the amount of space that is not used anymore.
+ */
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+ struct folio *folio;
+ unsigned int offset;
+ unsigned int freed;
+};
+
+/*
+ * Chunk that is read and written for each GC operation.
+ *
+ * Note that for writes to actual zoned devices, the chunk can be split when
+ * reaching the hardware limit.
+ */
+struct xfs_gc_bio {
+ struct xfs_zone_gc_data *data;
+
+ /*
+ * Entry into the reading/writing/resetting list. Only accessed from
+ * the GC thread, so no locking needed.
+ */
+ struct list_head entry;
+
+ /*
+ * State of this gc_bio. Done means the current I/O completed.
+ * Set from the bio end I/O handler, read from the GC thread.
+ */
+ enum {
+ XFS_GC_BIO_NEW,
+ XFS_GC_BIO_DONE,
+ } state;
+
+ /*
+ * Pointer to the inode and byte range in the inode that this
+ * GC chunk is operating on.
+ */
+ struct xfs_inode *ip;
+ loff_t offset;
+ unsigned int len;
+
+ /*
+ * Existing startblock (in the zone to be freed) and newly assigned
+ * daddr in the zone GCed into.
+ */
+ xfs_fsblock_t old_startblock;
+ xfs_daddr_t new_daddr;
+ struct xfs_zone_scratch *scratch;
+
+ /* Are we writing to a sequential write required zone? */
+ bool is_seq;
+
+ /* Open Zone being written to */
+ struct xfs_open_zone *oz;
+
+ /* Bio used for reads and writes, including the bvec used by it */
+ struct bio_vec bv;
+ struct bio bio; /* must be last */
+};
+
+#define XFS_ZONE_GC_RECS 1024
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+ struct xfs_rtgroup *victim_rtg;
+ unsigned int rec_count;
+ unsigned int rec_idx;
+ xfs_agblock_t next_startblock;
+ struct xfs_rmap_irec *recs;
+};
+
+/*
+ * Per-mount GC state.
+ */
+struct xfs_zone_gc_data {
+ struct xfs_mount *mp;
+
+ /* bioset used to allocate the gc_bios */
+ struct bio_set bio_set;
+
+ /*
+	 * Scratchpads used, and the index of the one currently in use.
+ */
+ struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
+ unsigned int scratch_idx;
+
+ /*
+ * List of bios currently being read, written and reset.
+ * These lists are only accessed by the GC thread itself, and must only
+ * be processed in order.
+ */
+ struct list_head reading;
+ struct list_head writing;
+ struct list_head resetting;
+
+ /*
+ * Iterator for the victim zone.
+ */
+ struct xfs_zone_gc_iter iter;
+};
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
+{
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ return false;
+ if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
+ mp->m_groups[XG_TYPE_RTG].blocks *
+ (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+ return false;
+}
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_gc_data *data;
+ int i;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+ data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
+ GFP_KERNEL);
+ if (!data->iter.recs)
+ goto out_free_data;
+
+ /*
+ * We actually only need a single bio_vec. It would be nice to have
+ * a flag that only allocates the inline bvecs and not the separate
+ * bvec pool.
+ */
+ if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_recs;
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+ data->scratch[i].folio =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+ if (!data->scratch[i].folio)
+ goto out_free_scratch;
+ }
+ INIT_LIST_HEAD(&data->reading);
+ INIT_LIST_HEAD(&data->writing);
+ INIT_LIST_HEAD(&data->resetting);
+ data->mp = mp;
+ return data;
+
+out_free_scratch:
+ while (--i >= 0)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+out_free_recs:
+ kfree(data->iter.recs);
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+ struct xfs_zone_gc_data *data)
+{
+ int i;
+
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+ kfree(data->iter.recs);
+ kfree(data);
+}
+
+static void
+xfs_zone_gc_iter_init(
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rtgroup *victim_rtg)
+
+{
+ iter->next_startblock = 0;
+ iter->rec_count = 0;
+ iter->rec_idx = 0;
+ iter->victim_rtg = victim_rtg;
+}
+
+/*
+ * Query the rmap of the victim zone to gather the records to evacuate.
+ */
+static int
+xfs_zone_gc_query_cb(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec,
+ void *private)
+{
+ struct xfs_zone_gc_iter *iter = private;
+
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+ ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+ ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+ iter->recs[iter->rec_count] = *irec;
+ if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+ iter->next_startblock =
+ irec->rm_startblock + irec->rm_blockcount;
+ return 1;
+ }
+ return 0;
+}
+
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_rmap_irec *reca = a;
+ const struct xfs_rmap_irec *recb = b;
+ int diff;
+
+ diff = cmp_int(reca->rm_owner, recb->rm_owner);
+ if (diff)
+ return diff;
+ return cmp_int(reca->rm_offset, recb->rm_offset);
+}
+
+static int
+xfs_zone_gc_query(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *rtg = iter->victim_rtg;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_btree_cur *cur;
+ struct xfs_trans *tp;
+ int error;
+
+ ASSERT(iter->next_startblock <= rtg_blocks(rtg));
+ if (iter->next_startblock == rtg_blocks(rtg))
+ goto done;
+
+ ASSERT(iter->next_startblock < rtg_blocks(rtg));
+ ri_low.rm_startblock = iter->next_startblock;
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+
+ iter->rec_idx = 0;
+ iter->rec_count = 0;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_zone_gc_query_cb, iter);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+ xfs_trans_cancel(tp);
+
+ if (error < 0)
+ return error;
+
+ /*
+ * Sort the rmap records by inode number and increasing offset to
+ * defragment the mappings.
+ *
+ * This could be further enhanced by an even bigger look ahead window,
+ * but that's better left until we have better detection of changes to
+ * inode mapping to avoid the potential of GCing already dead data.
+ */
+ sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+ xfs_zone_gc_rmap_rec_cmp, NULL);
+
+ if (error == 0) {
+ /*
+ * We finished iterating through the zone.
+ */
+ iter->next_startblock = rtg_blocks(rtg);
+ if (iter->rec_count == 0)
+ goto done;
+ }
+
+ return 0;
+done:
+ xfs_rtgroup_rele(iter->victim_rtg);
+ iter->victim_rtg = NULL;
+ return 0;
+}
+
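+/*
+ * Look up the next rmap record to evacuate and grab a reference to the inode
+ * owning it, skipping records whose inodes have been deleted or that do not
+ * refer to regular realtime files.
+ */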
+static bool
+xfs_zone_gc_iter_next(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rmap_irec *chunk_rec,
+ struct xfs_inode **ipp)
+{
+ struct xfs_rmap_irec *irec;
+ int error;
+
+ if (!iter->victim_rtg)
+ return false;
+
+retry:
+ if (iter->rec_idx == iter->rec_count) {
+ error = xfs_zone_gc_query(mp, iter);
+ if (error)
+ goto fail;
+ if (!iter->victim_rtg)
+ return false;
+ }
+
+ irec = &iter->recs[iter->rec_idx];
+ error = xfs_iget(mp, NULL, irec->rm_owner,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+ if (error) {
+ /*
+ * If the inode was already deleted, skip over it.
+ */
+ if (error == -ENOENT) {
+ iter->rec_idx++;
+ goto retry;
+ }
+ goto fail;
+ }
+
+ if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
+ iter->rec_idx++;
+ xfs_irele(*ipp);
+ goto retry;
+ }
+
+ *chunk_rec = *irec;
+ return true;
+
+fail:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+ struct xfs_zone_gc_iter *iter,
+ xfs_extlen_t count_fsb)
+{
+ struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
+
+ irec->rm_offset += count_fsb;
+ irec->rm_startblock += count_fsb;
+ irec->rm_blockcount -= count_fsb;
+ if (!irec->rm_blockcount)
+ iter->rec_idx++;
+}
+
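+/*
+ * Find the zone with the least used blocks in the given bucket, skipping
+ * zones that are empty and thus just waiting for a reset.
+ */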
+static struct xfs_rtgroup *
+xfs_zone_gc_pick_victim_from(
+ struct xfs_mount *mp,
+ uint32_t bucket)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t victim_used = U32_MAX;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ uint32_t bit;
+
+ if (!zi->zi_used_bucket_entries[bucket])
+ return NULL;
+
+ for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
+ mp->m_sb.sb_rgcount) {
+ struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
+
+ if (!rtg)
+ continue;
+
+ /* skip zones that are just waiting for a reset */
+ if (rtg_rmap(rtg)->i_used_blocks == 0 ||
+ rtg_rmap(rtg)->i_used_blocks >= victim_used) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
+ if (victim_rtg)
+ xfs_rtgroup_rele(victim_rtg);
+ victim_rtg = rtg;
+ victim_used = rtg_rmap(rtg)->i_used_blocks;
+
+ /*
+ * Any zone that is less than 1 percent used is fair game for
+ * instant reclaim. All of these zones are in the last
+ * bucket, so avoid the expensive division for the zones
+ * in the other buckets.
+ */
+ if (bucket == 0 &&
+ rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
+ break;
+ }
+
+ return victim_rtg;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate to
+ * reclaim.
+ */
+static bool
+xfs_zone_gc_select_victim(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ unsigned int bucket;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (iter->victim_rtg)
+ return true;
+
+ /*
+ * Don't start new work if we are asked to stop or park.
+ */
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+
+ if (!xfs_zoned_need_gc(mp))
+ return false;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
+ victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
+ if (victim_rtg)
+ break;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ if (!victim_rtg)
+ return false;
+
+ trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
+ xfs_zone_gc_iter_init(iter, victim_rtg);
+ return true;
+}
+
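+/*
+ * Take over the open zone with the lowest write pointer (i.e. the most space
+ * still left to write) from the regular open zone list for use by GC.
+ */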
+static struct xfs_open_zone *
+xfs_zone_gc_steal_open(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz, *found = NULL;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+ if (!found ||
+ oz->oz_write_pointer < found->oz_write_pointer)
+ found = oz;
+ }
+
+ if (found) {
+ found->oz_is_gc = true;
+ list_del_init(&found->oz_entry);
+ zi->zi_nr_open_zones--;
+ }
+
+ spin_unlock(&zi->zi_open_zones_lock);
+ return found;
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_select_target(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = zi->zi_open_gc_zone;
+
+ /*
+ * We need to wait for pending writes to finish.
+ */
+ if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
+ return NULL;
+
+ ASSERT(zi->zi_nr_open_zones <=
+ mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
+ if (oz)
+ trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ spin_lock(&zi->zi_open_zones_lock);
+ zi->zi_open_gc_zone = oz;
+ spin_unlock(&zi->zi_open_zones_lock);
+ return oz;
+}
+
+/*
+ * Ensure we have a valid open zone to write the GC data to.
+ *
+ * If the current target zone has space keep writing to it, else first wait for
+ * all pending writes and then pick a new one.
+ */
+static struct xfs_open_zone *
+xfs_zone_gc_ensure_target(
+ struct xfs_mount *mp)
+{
+ struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
+
+ if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ return xfs_zone_gc_select_target(mp);
+ return oz;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+ struct xfs_zone_gc_data *data)
+{
+ return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_open_zone *oz;
+
+ oz = xfs_zone_gc_ensure_target(data->mp);
+ if (!oz)
+ return false;
+ return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
+ xfs_zone_gc_scratch_available(data);
+}
+
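+/*
+ * End I/O handler for GC reads, writes and zone resets: mark the chunk as
+ * done and wake the GC thread to process the completion.
+ */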
+static void
+xfs_zone_gc_end_io(
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ struct xfs_zone_gc_data *data = chunk->data;
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
+ wake_up_process(data->mp->m_zone_info->zi_gc_thread);
+}
+
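+/*
+ * Hand out space in the GC target zone for the next chunk, limited by the
+ * scratch space left, the space remaining in the target zone and the
+ * reserved pools.
+ */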
+static struct xfs_open_zone *
+xfs_zone_gc_alloc_blocks(
+ struct xfs_zone_gc_data *data,
+ xfs_extlen_t *count_fsb,
+ xfs_daddr_t *daddr,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = data->mp;
+ struct xfs_open_zone *oz;
+
+ oz = xfs_zone_gc_ensure_target(mp);
+ if (!oz)
+ return NULL;
+
+ *count_fsb = min(*count_fsb,
+ XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ *
+ * If we'd take them from the normal pool we could be stealing blocks
+ * from a regular writer, which would then have to wait for GC and
+ * deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ *count_fsb = min(*count_fsb,
+ rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
+ *count_fsb = min3(*count_fsb,
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail,
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!*count_fsb)
+ return NULL;
+
+ *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
+ if (!*is_seq)
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
+ oz->oz_write_pointer += *count_fsb;
+ atomic_inc(&oz->oz_ref);
+ return oz;
+}
+
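+/*
+ * Start garbage collecting the next chunk: allocate target blocks and kick
+ * off the read of the still live data into the scratch folio.
+ */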
+static bool
+xfs_zone_gc_start_chunk(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_open_zone *oz;
+ struct xfs_rmap_irec irec;
+ struct xfs_gc_bio *chunk;
+ struct xfs_inode *ip;
+ struct bio *bio;
+ xfs_daddr_t daddr;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+ return false;
+ oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
+ &is_seq);
+ if (!oz) {
+ xfs_irele(ip);
+ return false;
+ }
+
+ bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->ip = ip;
+ chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+ chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->old_startblock =
+ xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+ chunk->new_daddr = daddr;
+ chunk->is_seq = is_seq;
+ chunk->scratch = &data->scratch[data->scratch_idx];
+ chunk->data = data;
+ chunk->oz = oz;
+
+ bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+ bio->bi_end_io = xfs_zone_gc_end_io;
+ bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+ chunk->scratch->offset);
+ chunk->scratch->offset += chunk->len;
+ if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+ data->scratch_idx =
+ (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+ }
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->reading);
+ xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+ submit_bio(bio);
+ return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ list_del(&chunk->entry);
+ xfs_open_zone_put(chunk->oz);
+ xfs_irele(chunk->ip);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_zone_gc_submit_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ if (chunk->is_seq) {
+ chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+ chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+ chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+ submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_zone_gc_split_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ struct queue_limits *lim =
+ &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+ struct xfs_gc_bio *split_chunk;
+ int split_sectors;
+ unsigned int split_len;
+ struct bio *split;
+ unsigned int nsegs;
+
+ if (!chunk->is_seq)
+ return NULL;
+
+ split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+ if (!split_sectors)
+ return NULL;
+
+ /* ensure the split chunk is still block size aligned */
+ split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
+ data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
+ split_len = split_sectors << SECTOR_SHIFT;
+
+ split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+ split_chunk = container_of(split, struct xfs_gc_bio, bio);
+ split_chunk->data = data;
+ ihold(VFS_I(chunk->ip));
+ split_chunk->ip = chunk->ip;
+ split_chunk->is_seq = chunk->is_seq;
+ split_chunk->scratch = chunk->scratch;
+ split_chunk->offset = chunk->offset;
+ split_chunk->len = split_len;
+ split_chunk->old_startblock = chunk->old_startblock;
+ split_chunk->new_daddr = chunk->new_daddr;
+ split_chunk->oz = chunk->oz;
+ atomic_inc(&chunk->oz->oz_ref);
+
+ chunk->offset += split_len;
+ chunk->len -= split_len;
+ chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+ /* add right before the original chunk */
+ WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&split_chunk->entry, &chunk->entry);
+ return split_chunk;
+}
+
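+/*
+ * A GC read completed.  Reuse the bio to write the data out to the target
+ * zone, splitting the I/O if it exceeds the zone append limit.
+ */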
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
+ struct xfs_gc_bio *split_chunk;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_move_tail(&chunk->entry, &data->writing);
+
+ bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+ bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
+ folio_offset);
+
+ while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
+ xfs_zone_gc_submit_write(data, split_chunk);
+ xfs_zone_gc_submit_write(data, chunk);
+}
+
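+/*
+ * A GC write completed.  Remap the affected file range to the new location,
+ * unless another write raced with us and changed the mapping in the meantime.
+ */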
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ chunk->scratch->freed += chunk->len;
+ if (chunk->scratch->freed == chunk->scratch->offset) {
+ chunk->scratch->offset = 0;
+ chunk->scratch->freed = 0;
+ }
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ *
+ * Note that xfs_zoned_end_io() below checks that no other writer raced
+ * with us to update the mapping by checking that the old startblock
+ * didn't change.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->oz, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(chunk);
+}
+
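+/*
+ * A zone reset or discard completed.  Mark the zone free again and add its
+ * blocks back to the available pool.
+ */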
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_rtgroup *rtg = chunk->bio.bi_private;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ if (chunk->bio.bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ atomic_inc(&zi->zi_nr_free_zones);
+
+ xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+ wake_up_all(&zi->zi_zone_wait);
+out:
+ list_del(&chunk->entry);
+ bio_put(&chunk->bio);
+}
+
+static bool
+xfs_zone_gc_prepare_reset(
+ struct bio *bio,
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_zone_reset(rtg);
+
+ ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ if (!bdev_max_discard_sectors(bio->bi_bdev))
+ return false;
+ bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+ bio->bi_iter.bi_size =
+ XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
+ }
+
+ return true;
+}
+
+int
+xfs_zone_gc_reset_sync(
+ struct xfs_rtgroup *rtg)
+{
+ int error = 0;
+ struct bio bio;
+
+ bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+ REQ_OP_ZONE_RESET);
+ if (xfs_zone_gc_prepare_reset(&bio, rtg))
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ return error;
+}
+
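+/*
+ * Flush the RT device cache and then reset (or discard) each zone on the
+ * passed in reset list, forcing out its rmap inode beforehand.
+ */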
+static void
+xfs_zone_gc_reset_zones(
+ struct xfs_zone_gc_data *data,
+ struct xfs_group *reset_list)
+{
+ struct xfs_group *next = reset_list;
+
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ do {
+ struct xfs_rtgroup *rtg = to_rtg(next);
+ struct xfs_gc_bio *chunk;
+ struct bio *bio;
+
+ xfs_log_force_inode(rtg_rmap(rtg));
+
+ next = rtg_group(rtg)->xg_next_reset;
+ rtg_group(rtg)->xg_next_reset = NULL;
+
+ bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_gc_end_io;
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->data = data;
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->resetting);
+
+ /*
+ * Also use the bio to drive the state machine when neither
+ * zone reset nor discard is supported to keep things simple.
+ */
+ if (xfs_zone_gc_prepare_reset(bio, rtg))
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+ } while (next);
+}
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static bool
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_gc_bio *chunk, *next;
+ struct xfs_group *reset_list;
+ struct blk_plug plug;
+
+ spin_lock(&zi->zi_reset_list_lock);
+ reset_list = zi->zi_reset_list;
+ zi->zi_reset_list = NULL;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (!xfs_zone_gc_select_victim(data) ||
+ !xfs_zone_gc_space_available(data)) {
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !reset_list)
+ return false;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ try_to_freeze();
+
+ if (reset_list)
+ xfs_zone_gc_reset_zones(data, reset_list);
+
+ list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_reset(chunk);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_chunk(chunk);
+ }
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_write_chunk(chunk);
+ }
+ blk_finish_plug(&plug);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ return true;
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before. Because of that reflinks
+ * are currently not supported on zoned file systems and can't be created or
+ * mounted.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_zone_gc_data *data = private;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_zonegc_running(mp);
+ if (xfs_zone_gc_handle_work(data))
+ continue;
+
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !zi->zi_reset_list) {
+ xfs_clear_zonegc_running(mp);
+ xfs_zoned_resv_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_zonegc_running(mp);
+
+ if (data->iter.victim_rtg)
+ xfs_rtgroup_rele(data->iter.victim_rtg);
+
+ memalloc_nofs_restore(nofs_flag);
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+int
+xfs_zone_gc_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_gc_data *data;
+ struct xfs_open_zone *oz;
+ int error;
+
+ /*
+ * If there are no free zones available for GC, pick the open zone with
+ * the least used space to GC into. This should only happen after an
+ * unclean shutdown near ENOSPC while GC was ongoing.
+ *
+ * We also need to do this for the first gc zone allocation if we
+ * unmounted while at the open limit.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
+ zi->zi_nr_open_zones == mp->m_max_open_zones)
+ oz = xfs_zone_gc_steal_open(zi);
+ else
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
+ if (!oz) {
+ xfs_warn(mp, "unable to allocate a zone for gc");
+ error = -EIO;
+ goto out;
+ }
+
+ trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ zi->zi_open_gc_zone = oz;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data) {
+ error = -ENOMEM;
+ goto out_put_gc_zone;
+ }
+
+ mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+ "xfs-zone-gc/%s", mp->m_super->s_id);
+ if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
+ goto out_free_gc_data;
+ }
+
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+ return 0;
+
+out_free_gc_data:
+	xfs_zone_gc_data_free(data);
+out_put_gc_zone:
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+out:
+ return error;
+}
+
+void
+xfs_zone_gc_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ kthread_stop(zi->zi_gc_thread);
+ if (zi->zi_open_gc_zone)
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+}
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
new file mode 100644
index 000000000000..733bcc2f8645
--- /dev/null
+++ b/fs/xfs/xfs_zone_info.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+
+static const char xfs_write_hint_shorthand[6][16] = {
+ "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
+
+static inline const char *
+xfs_write_hint_to_str(
+ uint8_t write_hint)
+{
+ if (write_hint > WRITE_LIFE_EXTREME)
+ return "UNKNOWN";
+ return xfs_write_hint_shorthand[write_hint];
+}
+
+static void
+xfs_show_open_zone(
+ struct seq_file *m,
+ struct xfs_open_zone *oz)
+{
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
+ rtg_rgno(oz->oz_rtg),
+ oz->oz_write_pointer, oz->oz_written,
+ rtg_rmap(oz->oz_rtg)->i_used_blocks,
+ xfs_write_hint_to_str(oz->oz_write_hint));
+}
+
+static void
+xfs_show_full_zone_used_distribution(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int reclaimable = 0, full, i;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ unsigned int entries = zi->zi_used_bucket_entries[i];
+
+ seq_printf(m, "\t %2u..%2u%%: %u\n",
+ i * (100 / XFS_ZONE_USED_BUCKETS),
+ (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
+ entries);
+ reclaimable += entries;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ full = mp->m_sb.sb_rgcount;
+ if (zi->zi_open_gc_zone)
+ full--;
+ full -= zi->zi_nr_open_zones;
+ full -= atomic_read(&zi->zi_nr_free_zones);
+ full -= reclaimable;
+
+ seq_printf(m, "\t 100%%: %u\n", full);
+}
+
+void
+xfs_zoned_show_stats(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ seq_puts(m, "\n");
+
+ seq_printf(m, "\tuser free RT blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ seq_printf(m, "\treserved free RT blocks: %lld\n",
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail);
+ seq_printf(m, "\tuser available RT blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
+ seq_printf(m, "\treserved available RT blocks: %lld\n",
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+ seq_printf(m, "\tRT reservations required: %d\n",
+ !list_empty_careful(&zi->zi_reclaim_reservations));
+ seq_printf(m, "\tRT GC required: %d\n",
+ xfs_zoned_need_gc(mp));
+
+ seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
+ seq_puts(m, "\topen zones:\n");
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ xfs_show_open_zone(m, oz);
+ if (zi->zi_open_gc_zone) {
+ seq_puts(m, "\topen gc zone:\n");
+ xfs_show_open_zone(m, zi->zi_open_gc_zone);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+ seq_puts(m, "\tused blocks distribution (fully written zones):\n");
+ xfs_show_full_zone_used_distribution(m, mp);
+}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
new file mode 100644
index 000000000000..ab696975a993
--- /dev/null
+++ b/fs/xfs/xfs_zone_priv.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_PRIV_H
+#define _XFS_ZONE_PRIV_H
+
+struct xfs_open_zone {
+ /*
+ * Entry in the open zone list and refcount. Protected by
+ * zi_open_zones_lock in struct xfs_zone_info.
+ */
+ struct list_head oz_entry;
+ atomic_t oz_ref;
+
+ /*
+ * oz_write_pointer is the write pointer at which space is handed out
+	 * for conventional zones, or simply the count of blocks handed out
+	 * so far for sequential write required zones. It is protected by
+	 * oz_alloc_lock.
+ */
+ spinlock_t oz_alloc_lock;
+ xfs_rgblock_t oz_write_pointer;
+
+ /*
+ * oz_written is the number of blocks for which we've received a
+ * write completion. oz_written must always be <= oz_write_pointer
+ * and is protected by the ILOCK of the rmap inode.
+ */
+ xfs_rgblock_t oz_written;
+
+ /*
+ * Write hint (data temperature) assigned to this zone, or
+ * WRITE_LIFE_NOT_SET if none was set.
+ */
+ enum rw_hint oz_write_hint;
+
+ /*
+ * Is this open zone used for garbage collection? There can only be a
+ * single open GC zone, which is pointed to by zi_open_gc_zone in
+ * struct xfs_zone_info. Constant over the life time of an open zone.
+	 * struct xfs_zone_info. Constant over the lifetime of an open zone.
+ bool oz_is_gc;
+
+ /*
+	 * Pointer to the RT group structure for this open zone. Constant over
+	 * the lifetime of an open zone.
+ */
+ struct xfs_rtgroup *oz_rtg;
+};
+
+/*
+ * Number of bitmap buckets to track reclaimable zones. There are 10 buckets
+ * so that each 10% of the usable capacity gets its own bucket and GC only
+ * has to walk the bitmaps of the lesser used zones if there are any.
+ */
+#define XFS_ZONE_USED_BUCKETS 10u
+
+struct xfs_zone_info {
+ /*
+ * List of pending space reservations:
+ */
+ spinlock_t zi_reservation_lock;
+ struct list_head zi_reclaim_reservations;
+
+ /*
+ * List and number of open zones:
+ */
+ spinlock_t zi_open_zones_lock;
+ struct list_head zi_open_zones;
+ unsigned int zi_nr_open_zones;
+
+ /*
+ * Free zone search cursor and number of free zones:
+ */
+ unsigned long zi_free_zone_cursor;
+ atomic_t zi_nr_free_zones;
+
+ /*
+ * Wait queue to wait for free zones or open zone resources to become
+ * available:
+ */
+ wait_queue_head_t zi_zone_wait;
+
+ /*
+ * Pointer to the GC thread, and the current open zone used by GC
+ * (if any).
+ *
+ * zi_open_gc_zone is mostly private to the GC thread, but can be read
+ * for debugging from other threads, in which case zi_open_zones_lock
+ * must be taken to access it.
+ */
+ struct task_struct *zi_gc_thread;
+ struct xfs_open_zone *zi_open_gc_zone;
+
+ /*
+ * List of zones that need a reset:
+ */
+ spinlock_t zi_reset_list_lock;
+ struct xfs_group *zi_reset_list;
+
+ /*
+	 * A set of bitmaps to bucket-sort reclaimable zones by used blocks to
+	 * help garbage collection quickly find the best candidate for reclaim.
+ */
+ spinlock_t zi_used_buckets_lock;
+ unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
+ unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
+};
+
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
+ enum rw_hint write_hint, bool is_gc);
+
+int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+int xfs_zone_gc_mount(struct xfs_mount *mp);
+void xfs_zone_gc_unmount(struct xfs_mount *mp);
+
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644
index 000000000000..93c9a7721139
--- /dev/null
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support a rtextsize > 1, so this code and
+ * the allocator itself use file system blocks interchangeably with realtime
+ * extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and add it to the per-mount zi_reclaim_reservations list.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_filblks_t count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user available capacity, to which the file
+ * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
+ * available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks. This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ switch (ctr) {
+ case XC_FREE_RTEXTENTS:
+ return (uint64_t)XFS_RESERVED_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks +
+ mp->m_sb.sb_rtreserved;
+ case XC_FREE_RTAVAILABLE:
+ return (uint64_t)XFS_GC_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+void
+xfs_zoned_resv_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ return;
+ }
+
+ spin_lock(&zi->zi_reservation_lock);
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+ if (reservation->count_fsb > count_fsb)
+ break;
+ wake_up_process(reservation->task);
+ count_fsb -= reservation->count_fsb;
+
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
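+/*
+ * Reserve blocks from the instantly available (XC_FREE_RTAVAILABLE) pool,
+ * waiting on the per-mount reservation list and kicking GC if there is not
+ * enough space right away.
+ */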
+static int
+xfs_zoned_reserve_available(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .count_fsb = count_fsb,
+ };
+ int error;
+
+ /*
+ * If there are no waiters, try to directly grab the available blocks
+ * from the percpu counter.
+ *
+ * If the caller wants to dip into the reserved pool also bypass the
+	 * wait list. This relies on the fact that we have a very generously
+ * sized reserved pool that always has enough space. If the reserved
+ * allocations fail we're in trouble.
+ */
+ if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+ (flags & XFS_ZR_RESERVED))) {
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ /*
+ * Make sure to start GC if it is not running already. As we
+ * check the rtavailable count when filling up zones, GC is
+ * normally already running at this point, but in some setups
+ * with very few zones we may completely run out of non-
+ * reserved blocks in between filling zones.
+ */
+ if (!xfs_is_zonegc_running(mp))
+ wake_up_process(zi->zi_gc_thread);
+
+ /*
+ * If there is no reclaimable group left and we aren't still
+ * processing a pending GC request give up as we're fully out
+ * of space.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
+ !xfs_is_zonegc_running(mp))
+ break;
+
+ spin_unlock(&zi->zi_reservation_lock);
+ schedule();
+ spin_lock(&zi->zi_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&zi->zi_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_inode *ip,
+ xfs_filblks_t *count_fsb,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ s64 len = *count_fsb;
+ int error = -ENOSPC;
+
+ spin_lock(&zi->zi_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ if (len > 0) {
+ *count_fsb = len;
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_inode *ip,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ ASSERT(ac->reserved_blocks == 0);
+ ASSERT(ac->open_zone == NULL);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+ error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ if (error)
+ return error;
+
+ error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ if (error) {
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+ return error;
+ }
+ ac->reserved_blocks = count_fsb;
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+ }
+ if (ac->open_zone)
+ xfs_open_zone_put(ac->open_zone);
+}
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 35166c92420c..42e2c0065bb3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);