summary refs log tree commit diff
path: root/fs
diff options
context:
space:
mode:
author Mike Marshall <hubcap@omnibond.com> 2017-02-25 11:12:48 -0500
committer Mike Marshall <hubcap@omnibond.com> 2017-02-25 11:12:48 -0500
commit e98bdb3059cbf2b1cd4261e126b08429f64466c3 (patch)
tree e378fc95b495cc6e0e558f247e99bcaa21a6d567 /fs
parent eb68d0324dc4d88ab0d6159bdcd98c247a3a8954 (diff)
parent c470abd4fde40ea6a0846a2beab642a578c0b8cd (diff)
download lwn-e98bdb3059cbf2b1cd4261e126b08429f64466c3.tar.gz
lwn-e98bdb3059cbf2b1cd4261e126b08429f64466c3.zip
Merge tag 'v4.10' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into for-next
Linux 4.10
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c16
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/9p/vfs_inode.c1
-rw-r--r--fs/9p/vfs_inode_dotl.c1
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/symlink.c1
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/aio.c115
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/autofs4/autofs_i.h5
-rw-r--r--fs/autofs4/dev-ioctl.c10
-rw-r--r--fs/autofs4/expire.c25
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/autofs4/root.c61
-rw-r--r--fs/autofs4/symlink.c1
-rw-r--r--fs/autofs4/waitq.c13
-rw-r--r--fs/bad_inode.c55
-rw-r--r--fs/befs/befs.h3
-rw-r--r--fs/befs/befs_fs_types.h12
-rw-r--r--fs/befs/btree.c48
-rw-r--r--fs/befs/btree.h8
-rw-r--r--fs/befs/datastream.c8
-rw-r--r--fs/befs/datastream.h5
-rw-r--r--fs/befs/debug.c14
-rw-r--r--fs/befs/inode.c12
-rw-r--r--fs/befs/inode.h5
-rw-r--r--fs/befs/io.c7
-rw-r--r--fs/befs/io.h1
-rw-r--r--fs/befs/linuxvfs.c134
-rw-r--r--fs/befs/super.h4
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf.c9
-rw-r--r--fs/binfmt_elf_fdpic.c2
-rw-r--r--fs/block_dev.c276
-rw-r--r--fs/btrfs/async-thread.c29
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c10
-rw-r--r--fs/btrfs/check-integrity.c103
-rw-r--r--fs/btrfs/check-integrity.h5
-rw-r--r--fs/btrfs/compression.c173
-rw-r--r--fs/btrfs/compression.h12
-rw-r--r--fs/btrfs/ctree.c495
-rw-r--r--fs/btrfs/ctree.h244
-rw-r--r--fs/btrfs/delayed-inode.c147
-rw-r--r--fs/btrfs/delayed-inode.h21
-rw-r--r--fs/btrfs/delayed-ref.c20
-rw-r--r--fs/btrfs/delayed-ref.h14
-rw-r--r--fs/btrfs/dev-replace.c68
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/dir-item.c45
-rw-r--r--fs/btrfs/disk-io.c603
-rw-r--r--fs/btrfs/disk-io.h34
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c1559
-rw-r--r--fs/btrfs/extent_io.c128
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/file-item.c207
-rw-r--r--fs/btrfs/file.c250
-rw-r--r--fs/btrfs/free-space-cache.c164
-rw-r--r--fs/btrfs/free-space-cache.h12
-rw-r--r--fs/btrfs/free-space-tree.c44
-rw-r--r--fs/btrfs/inode-item.c11
-rw-r--r--fs/btrfs/inode-map.c22
-rw-r--r--fs/btrfs/inode.c959
-rw-r--r--fs/btrfs/ioctl.c623
-rw-r--r--fs/btrfs/lzo.c17
-rw-r--r--fs/btrfs/ordered-data.c38
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/print-tree.c19
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/props.c5
-rw-r--r--fs/btrfs/qgroup.c299
-rw-r--r--fs/btrfs/qgroup.h64
-rw-r--r--fs/btrfs/raid56.c78
-rw-r--r--fs/btrfs/raid56.h8
-rw-r--r--fs/btrfs/reada.c62
-rw-r--r--fs/btrfs/relocation.c453
-rw-r--r--fs/btrfs/root-tree.c28
-rw-r--r--fs/btrfs/scrub.c183
-rw-r--r--fs/btrfs/send.c33
-rw-r--r--fs/btrfs/super.c162
-rw-r--r--fs/btrfs/tests/btrfs-tests.c14
-rw-r--r--fs/btrfs/tests/btrfs-tests.h4
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c7
-rw-r--r--fs/btrfs/tests/extent-io-tests.c7
-rw-r--r--fs/btrfs/tests/free-space-tests.c18
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c9
-rw-r--r--fs/btrfs/tests/inode-tests.c16
-rw-r--r--fs/btrfs/tests/qgroup-tests.c11
-rw-r--r--fs/btrfs/transaction.c615
-rw-r--r--fs/btrfs/transaction.h29
-rw-r--r--fs/btrfs/tree-log.c215
-rw-r--r--fs/btrfs/uuid-tree.c27
-rw-r--r--fs/btrfs/volumes.c849
-rw-r--r--fs/btrfs/volumes.h72
-rw-r--r--fs/btrfs/xattr.c21
-rw-r--r--fs/btrfs/zlib.c16
-rw-r--r--fs/buffer.c130
-rw-r--r--fs/ceph/addr.c112
-rw-r--r--fs/ceph/caps.c328
-rw-r--r--fs/ceph/dir.c56
-rw-r--r--fs/ceph/export.c26
-rw-r--r--fs/ceph/file.c131
-rw-r--r--fs/ceph/inode.c26
-rw-r--r--fs/ceph/mds_client.c37
-rw-r--r--fs/ceph/mdsmap.c163
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c12
-rw-r--r--fs/ceph/super.h15
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifsencrypt.c14
-rw-r--r--fs/cifs/cifsfs.c3
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/cifsproto.h6
-rw-r--r--fs/cifs/cifssmb.c2
-rw-r--r--fs/cifs/connect.c85
-rw-r--r--fs/cifs/dir.c4
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/ioctl.c2
-rw-r--r--fs/cifs/link.c9
-rw-r--r--fs/cifs/readdir.c1
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2pdu.c87
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/smb2proto.h1
-rw-r--r--fs/cifs/smbencrypt.c40
-rw-r--r--fs/cifs/transport.c3
-rw-r--r--fs/coda/cnode.c1
-rw-r--r--fs/compat.c83
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/configfs/file.c2
-rw-r--r--fs/configfs/symlink.c1
-rw-r--r--fs/coredump.c20
-rw-r--r--fs/crypto/Kconfig2
-rw-r--r--fs/crypto/crypto.c123
-rw-r--r--fs/crypto/fname.c8
-rw-r--r--fs/crypto/fscrypt_private.h93
-rw-r--r--fs/crypto/keyinfo.c11
-rw-r--r--fs/crypto/policy.c41
-rw-r--r--fs/dax.c1489
-rw-r--r--fs/dcache.c49
-rw-r--r--fs/dcookies.c6
-rw-r--r--fs/direct-io.c35
-rw-r--r--fs/dlm/ast.c2
-rw-r--r--fs/dlm/config.c2
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/lowcomms.c28
-rw-r--r--fs/dlm/main.c2
-rw-r--r--fs/dlm/netlink.c18
-rw-r--r--fs/dlm/user.c1
-rw-r--r--fs/ecryptfs/inode.c30
-rw-r--r--fs/efs/efs.h2
-rw-r--r--fs/eventpoll.c2
-rw-r--r--fs/exec.c37
-rw-r--r--fs/exofs/inode.c68
-rw-r--r--fs/ext2/Kconfig1
-rw-r--r--fs/ext2/file.c35
-rw-r--r--fs/ext2/inode.c27
-rw-r--r--fs/ext2/ioctl.c2
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/ext4.h31
-rw-r--r--fs/ext4/ext4_jbd2.h14
-rw-r--r--fs/ext4/extents.c42
-rw-r--r--fs/ext4/file.c224
-rw-r--r--fs/ext4/ialloc.c5
-rw-r--r--fs/ext4/inline.c18
-rw-r--r--fs/ext4/inode.c367
-rw-r--r--fs/ext4/ioctl.c84
-rw-r--r--fs/ext4/mballoc.c4
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/namei.c24
-rw-r--r--fs/ext4/page-io.c7
-rw-r--r--fs/ext4/super.c160
-rw-r--r--fs/ext4/symlink.c3
-rw-r--r--fs/ext4/xattr.c45
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c38
-rw-r--r--fs/f2fs/data.c212
-rw-r--r--fs/f2fs/debug.c29
-rw-r--r--fs/f2fs/dir.c30
-rw-r--r--fs/f2fs/extent_cache.c2
-rw-r--r--fs/f2fs/f2fs.h203
-rw-r--r--fs/f2fs/file.c86
-rw-r--r--fs/f2fs/gc.c35
-rw-r--r--fs/f2fs/inline.c16
-rw-r--r--fs/f2fs/inode.c47
-rw-r--r--fs/f2fs/namei.c8
-rw-r--r--fs/f2fs/node.c230
-rw-r--r--fs/f2fs/node.h13
-rw-r--r--fs/f2fs/recovery.c46
-rw-r--r--fs/f2fs/segment.c240
-rw-r--r--fs/f2fs/segment.h28
-rw-r--r--fs/f2fs/shrinker.c10
-rw-r--r--fs/f2fs/super.c283
-rw-r--r--fs/f2fs/xattr.c4
-rw-r--r--fs/fcntl.c4
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/fs-writeback.c16
-rw-r--r--fs/fscache/cookie.c5
-rw-r--r--fs/fscache/netfs.c1
-rw-r--r--fs/fscache/object.c32
-rw-r--r--fs/fuse/dev.c8
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/dir.c1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c4
-rw-r--r--fs/gfs2/inode.c3
-rw-r--r--fs/gfs2/log.c4
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/meta_io.c7
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/internal.h5
-rw-r--r--fs/ioctl.c6
-rw-r--r--fs/iomap.c381
-rw-r--r--fs/isofs/compress.c1
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c9
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jffs2/symlink.c1
-rw-r--r--fs/jfs/ioctl.c4
-rw-r--r--fs/jfs/jfs_debug.c2
-rw-r--r--fs/jfs/jfs_logmgr.c4
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/symlink.c2
-rw-r--r--fs/kernfs/inode.c4
-rw-r--r--fs/kernfs/symlink.c1
-rw-r--r--fs/libfs.c20
-rw-r--r--fs/lockd/netns.h2
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/locks.c2
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c322
-rw-r--r--fs/logfs/dev_mtd.c274
-rw-r--r--fs/logfs/dir.c801
-rw-r--r--fs/logfs/file.c285
-rw-r--r--fs/logfs/gc.c732
-rw-r--r--fs/logfs/inode.c428
-rw-r--r--fs/logfs/journal.c894
-rw-r--r--fs/logfs/logfs.h735
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2298
-rw-r--r--fs/logfs/segment.c961
-rw-r--r--fs/logfs/super.c653
-rw-r--r--fs/mbcache.c41
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/mount.h6
-rw-r--r--fs/mpage.c9
-rw-r--r--fs/namei.c207
-rw-r--r--fs/namespace.c111
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c3
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/ncpfs/sock.c2
-rw-r--r--fs/ncpfs/symlink.c2
-rw-r--r--fs/nfs/callback_proc.c99
-rw-r--r--fs/nfs/client.c6
-rw-r--r--fs/nfs/delegation.c4
-rw-r--r--fs/nfs/dir.c96
-rw-r--r--fs/nfs/direct.c14
-rw-r--r--fs/nfs/file.c13
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c322
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h23
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c153
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/inode.c112
-rw-r--r--fs/nfs/internal.h10
-rw-r--r--fs/nfs/netns.h2
-rw-r--r--fs/nfs/nfs3client.c5
-rw-r--r--fs/nfs/nfs42proc.c9
-rw-r--r--fs/nfs/nfs42xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4client.c32
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c305
-rw-r--r--fs/nfs/nfs4session.c2
-rw-r--r--fs/nfs/nfs4state.c82
-rw-r--r--fs/nfs/nfs4xdr.c131
-rw-r--r--fs/nfs/objlayout/objlayout.c4
-rw-r--r--fs/nfs/objlayout/objlayout.h1
-rw-r--r--fs/nfs/pagelist.c3
-rw-r--r--fs/nfs/pnfs.c425
-rw-r--r--fs/nfs/pnfs.h77
-rw-r--r--fs/nfs/pnfs_nfs.c28
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/symlink.c1
-rw-r--r--fs/nfs/write.c5
-rw-r--r--fs/nfs_common/grace.c2
-rw-r--r--fs/nfsd/fault_inject.c2
-rw-r--r--fs/nfsd/netns.h2
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/nfsd/nfs4layouts.c13
-rw-r--r--fs/nfsd/nfs4proc.c46
-rw-r--r--fs/nfsd/nfs4state.c19
-rw-r--r--fs/nfsd/nfs4xdr.c89
-rw-r--r--fs/nfsd/nfscache.c11
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/nfsd.h36
-rw-r--r--fs/nfsd/nfssvc.c4
-rw-r--r--fs/nfsd/state.h4
-rw-r--r--fs/nfsd/vfs.c11
-rw-r--r--fs/nilfs2/namei.c1
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/dnotify/dnotify.c2
-rw-r--r--fs/notify/fanotify/fanotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.h2
-rw-r--r--fs/notify/fsnotify.c8
-rw-r--r--fs/notify/inode_mark.c45
-rw-r--r--fs/notify/inotify/inotify.h2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c4
-rw-r--r--fs/notify/mark.c12
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/aops.c3
-rw-r--r--fs/ntfs/file.c7
-rw-r--r--fs/ntfs/logfile.c1
-rw-r--r--fs/ntfs/mft.c1
-rw-r--r--fs/ocfs2/alloc.c7
-rw-r--r--fs/ocfs2/aops.c40
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/buffer_head_io.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/cluster/masklog.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c11
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmglue.c10
-rw-r--r--fs/ocfs2/file.c42
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/ocfs2/mmap.c3
-rw-r--r--fs/ocfs2/move_extents.c10
-rw-r--r--fs/ocfs2/namei.c6
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/quota_global.c10
-rw-r--r--fs/ocfs2/quota_local.c11
-rw-r--r--fs/ocfs2/refcounttree.c465
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/stack_user.c2
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h3
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/ocfs2/symlink.c1
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/open.c2
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/devorangefs-req.c13
-rw-r--r--fs/orangefs/file.c2
-rw-r--r--fs/orangefs/inode.c1
-rw-r--r--fs/orangefs/orangefs-debugfs.c6
-rw-r--r--fs/orangefs/symlink.c1
-rw-r--r--fs/overlayfs/Kconfig14
-rw-r--r--fs/overlayfs/Makefile2
-rw-r--r--fs/overlayfs/copy_up.c63
-rw-r--r--fs/overlayfs/dir.c375
-rw-r--r--fs/overlayfs/inode.c79
-rw-r--r--fs/overlayfs/namei.c410
-rw-r--r--fs/overlayfs/overlayfs.h62
-rw-r--r--fs/overlayfs/ovl_entry.h53
-rw-r--r--fs/overlayfs/super.c557
-rw-r--r--fs/overlayfs/util.c265
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/pnode.c74
-rw-r--r--fs/posix_acl.c9
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c60
-rw-r--r--fs/proc/fd.c6
-rw-r--r--fs/proc/generic.c3
-rw-r--r--fs/proc/inode.c40
-rw-r--r--fs/proc/internal.h6
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/namespaces.c3
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/page.c5
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c3
-rw-r--r--fs/proc/proc_tty.c2
-rw-r--r--fs/proc/root.c3
-rw-r--r--fs/proc/self.c13
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/proc/thread_self.c14
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/pstore/Kconfig2
-rw-r--r--fs/pstore/ftrace.c11
-rw-r--r--fs/pstore/inode.c15
-rw-r--r--fs/pstore/internal.h34
-rw-r--r--fs/pstore/platform.c5
-rw-r--r--fs/pstore/ram.c327
-rw-r--r--fs/pstore/ram_core.c27
-rw-r--r--fs/quota/dquot.c138
-rw-r--r--fs/quota/netlink.c10
-rw-r--r--fs/quota/quota.c26
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c260
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/reiserfs/journal.c6
-rw-r--r--fs/reiserfs/namei.c1
-rw-r--r--fs/reiserfs/stree.c1
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/romfs/super.c23
-rw-r--r--fs/select.c2
-rw-r--r--fs/seq_file.c9
-rw-r--r--fs/splice.c11
-rw-r--r--fs/squashfs/block.c1
-rw-r--r--fs/squashfs/symlink.c1
-rw-r--r--fs/stat.c10
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c81
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/timerfd.c26
-rw-r--r--fs/ubifs/Kconfig11
-rw-r--r--fs/ubifs/Makefile1
-rw-r--r--fs/ubifs/crypto.c97
-rw-r--r--fs/ubifs/debug.c14
-rw-r--r--fs/ubifs/dir.c426
-rw-r--r--fs/ubifs/file.c109
-rw-r--r--fs/ubifs/gc.c4
-rw-r--r--fs/ubifs/io.c18
-rw-r--r--fs/ubifs/ioctl.c23
-rw-r--r--fs/ubifs/journal.c226
-rw-r--r--fs/ubifs/key.h21
-rw-r--r--fs/ubifs/replay.c10
-rw-r--r--fs/ubifs/sb.c59
-rw-r--r--fs/ubifs/super.c17
-rw-r--r--fs/ubifs/tnc.c184
-rw-r--r--fs/ubifs/ubifs-media.h29
-rw-r--r--fs/ubifs/ubifs.h115
-rw-r--r--fs/ubifs/xattr.c116
-rw-r--r--fs/udf/dir.c1
-rw-r--r--fs/udf/directory.c1
-rw-r--r--fs/udf/inode.c1
-rw-r--r--fs/ufs/balloc.c4
-rw-r--r--fs/ufs/inode.c7
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/userfaultfd.c59
-rw-r--r--fs/utimes.c4
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c73
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c125
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_attr.c6
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h4
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c387
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h15
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_btree.c20
-rw-r--r--fs/xfs/libxfs/xfs_btree.h43
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h26
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c41
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h13
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c26
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c18
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c94
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c22
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c77
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h4
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c10
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c1
-rw-r--r--fs/xfs/libxfs/xfs_sb.c15
-rw-r--r--fs/xfs/libxfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_aops.c328
-rw-r--r--fs/xfs/xfs_aops.h9
-rw-r--r--fs/xfs/xfs_attr.h4
-rw-r--r--fs/xfs/xfs_attr_list.c59
-rw-r--r--fs/xfs/xfs_bmap_util.c73
-rw-r--r--fs/xfs/xfs_buf.c126
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_dquot.c4
-rw-r--r--fs/xfs/xfs_file.c293
-rw-r--r--fs/xfs/xfs_fsops.c14
-rw-r--r--fs/xfs/xfs_icache.c43
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c107
-rw-r--r--fs/xfs/xfs_inode.h18
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_ioctl.c12
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c106
-rw-r--r--fs/xfs/xfs_iops.c66
-rw-r--r--fs/xfs/xfs_linux.h9
-rw-r--r--fs/xfs/xfs_log.c53
-rw-r--r--fs/xfs/xfs_log_recover.c16
-rw-r--r--fs/xfs/xfs_mount.c7
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_pnfs.c7
-rw-r--r--fs/xfs/xfs_pnfs.h4
-rw-r--r--fs/xfs/xfs_qm.c5
-rw-r--r--fs/xfs/xfs_refcount_item.c3
-rw-r--r--fs/xfs/xfs_reflink.c396
-rw-r--r--fs/xfs/xfs_reflink.h6
-rw-r--r--fs/xfs/xfs_stats.c10
-rw-r--r--fs/xfs/xfs_stats.h200
-rw-r--r--fs/xfs/xfs_super.c27
-rw-r--r--fs/xfs/xfs_symlink.c7
-rw-r--r--fs/xfs/xfs_sysfs.c4
-rw-r--r--fs/xfs/xfs_trace.h109
-rw-r--r--fs/xfs/xfs_xattr.c23
538 files changed, 16359 insertions, 21240 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad79e1a5..adaf6f6dd858 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -34,6 +34,7 @@
#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/bvec.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -309,18 +310,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- if (unlikely(copied < len)) {
- /*
- * zero out the rest of the area
- */
- unsigned from = pos & (PAGE_SIZE - 1);
-
- zero_user(page, from + copied, len - copied);
- flush_dcache_page(page);
+ if (unlikely(copied < len && !PageUptodate(page))) {
+ copied = 0;
+ goto out;
}
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
@@ -330,6 +323,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
i_size_write(inode, last_pos);
}
set_page_dirty(page);
+out:
unlock_page(page);
put_page(page);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index d7b78d531e63..6a0f3fa85ef7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -34,7 +34,7 @@
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/utsname.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/slab.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 30ca770c5e0b..f4f4450119e4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1464,7 +1464,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
};
static const struct inode_operations v9fs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = v9fs_vfs_get_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index afaa4b6de801..5999bd050678 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -979,7 +979,6 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
};
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
- .readlink = generic_readlink,
.get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..83eab52fb3f6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
+ select FS_IOMAP
help
Direct Access (DAX) can be used on memory-backed block devices.
If the block device supports DAX and the filesystem supports DAX,
@@ -55,7 +56,6 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
- depends on BROKEN
endif # BLOCK
@@ -235,7 +235,6 @@ source "fs/efs/Kconfig"
source "fs/jffs2/Kconfig"
# UBIFS File system configuration
source "fs/ubifs/Kconfig"
-source "fs/logfs/Kconfig"
source "fs/cramfs/Kconfig"
source "fs/squashfs/Kconfig"
source "fs/freevxfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4c09d93d9569..b2f82cf6bf86 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -170,8 +170,8 @@ config BINFMT_MISC
You can do other nice things, too. Read the file
<file:Documentation/binfmt_misc.txt> to learn how to use this
- feature, <file:Documentation/java.txt> for information about how
- to include Java support. and <file:Documentation/mono.txt> for
+ feature, <file:Documentation/admin-guide/java.rst> for information about how
+ to include Java support. and <file:Documentation/admin-guide/mono.rst> for
information about how to include Mono-based .NET support.
To use binfmt_misc, you will need to mount it:
diff --git a/fs/Makefile b/fs/Makefile
index ed2b63257ba9..7bbaca9c67b1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -97,7 +97,6 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
obj-$(CONFIG_JFFS2_FS) += jffs2/
-obj-$(CONFIG_LOGFS) += logfs/
obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 69b03dbb792f..ae622cdce142 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -70,7 +70,6 @@ const struct address_space_operations affs_symlink_aops = {
};
const struct inode_operations affs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = affs_notify_change,
};
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 2853b4095344..35efb9a31dd7 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -14,7 +14,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
static struct proc_dir_entry *proc_afs;
diff --git a/fs/aio.c b/fs/aio.c
index 428484f2f841..873b4ca82ccb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,7 +42,7 @@
#include <linux/mount.h>
#include <asm/kmap_types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -277,10 +277,10 @@ static void put_aio_ring_file(struct kioctx *ctx)
struct address_space *i_mapping;
if (aio_ring_file) {
- truncate_setsize(aio_ring_file->f_inode, 0);
+ truncate_setsize(file_inode(aio_ring_file), 0);
/* Prevent further access to the kioctx from migratepages */
- i_mapping = aio_ring_file->f_inode->i_mapping;
+ i_mapping = aio_ring_file->f_mapping;
spin_lock(&i_mapping->private_lock);
i_mapping->private_data = NULL;
ctx->aio_ring_file = NULL;
@@ -483,7 +483,7 @@ static int aio_setup_ring(struct kioctx *ctx)
for (i = 0; i < nr_pages; i++) {
struct page *page;
- page = find_or_create_page(file->f_inode->i_mapping,
+ page = find_or_create_page(file->f_mapping,
i, GFP_HIGHUSER | __GFP_ZERO);
if (!page)
break;
@@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
* Tell lockdep we inherited freeze protection from submission
* thread.
*/
- __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ if (S_ISREG(file_inode(file)->i_mode))
+ __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
file_end_write(file);
}
@@ -1285,7 +1286,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
struct timespec __user *timeout)
{
- ktime_t until = { .tv64 = KTIME_MAX };
+ ktime_t until = KTIME_MAX;
long ret = 0;
if (timeout) {
@@ -1311,7 +1312,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until.tv64 == 0)
+ if (until == 0)
aio_read_events(ctx, min_nr, nr, event, &ret);
else
wait_event_interruptible_hrtimeout(ctx->wait,
@@ -1367,6 +1368,39 @@ out:
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
+{
+ struct kioctx *ioctx = NULL;
+ unsigned long ctx;
+ long ret;
+
+ ret = get_user(ctx, ctx32p);
+ if (unlikely(ret))
+ goto out;
+
+ ret = -EINVAL;
+ if (unlikely(ctx || nr_events == 0)) {
+ pr_debug("EINVAL: ctx %lu nr_events %u\n",
+ ctx, nr_events);
+ goto out;
+ }
+
+ ioctx = ioctx_alloc(nr_events);
+ ret = PTR_ERR(ioctx);
+ if (!IS_ERR(ioctx)) {
+ /* truncating is ok because it's a user address */
+ ret = put_user((u32)ioctx->user_id, ctx32p);
+ if (ret)
+ kill_ioctx(current->mm, ioctx, NULL);
+ percpu_ref_put(&ioctx->users);
+ }
+
+out:
+ return ret;
+}
+#endif
+
/* sys_io_destroy:
* Destroy the aio_context specified. May cancel any outstanding
* AIOs and block on completion. Will fail with -ENOSYS if not
@@ -1492,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
* by telling it the lock got released so that it doesn't
* complain about held lock when we return to userspace.
*/
- __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ if (S_ISREG(file_inode(file)->i_mode))
+ __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}
kfree(iovec);
return ret;
@@ -1591,8 +1626,8 @@ out_put_req:
return ret;
}
-long do_io_submit(aio_context_t ctx_id, long nr,
- struct iocb __user *__user *iocbpp, bool compat)
+static long do_io_submit(aio_context_t ctx_id, long nr,
+ struct iocb __user *__user *iocbpp, bool compat)
{
struct kioctx *ctx;
long ret = 0;
@@ -1662,6 +1697,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
return do_io_submit(ctx_id, nr, iocbpp, 0);
}
+#ifdef CONFIG_COMPAT
+static inline long
+copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+{
+ compat_uptr_t uptr;
+ int i;
+
+ for (i = 0; i < nr; ++i) {
+ if (get_user(uptr, ptr32 + i))
+ return -EFAULT;
+ if (put_user(compat_ptr(uptr), ptr64 + i))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
+
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+ int, nr, u32 __user *, iocb)
+{
+ struct iocb __user * __user *iocb64;
+ long ret;
+
+ if (unlikely(nr < 0))
+ return -EINVAL;
+
+ if (nr > MAX_AIO_SUBMITS)
+ nr = MAX_AIO_SUBMITS;
+
+ iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
+ ret = copy_iocb(nr, iocb, iocb64);
+ if (!ret)
+ ret = do_io_submit(ctx_id, nr, iocb64, 1);
+ return ret;
+}
+#endif
+
/* lookup_kiocb
* Finds a given iocb for cancellation.
*/
@@ -1761,3 +1834,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
}
return ret;
}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout)
+{
+ struct timespec t;
+ struct timespec __user *ut = NULL;
+
+ if (timeout) {
+ if (compat_get_timespec(&t, timeout))
+ return -EFAULT;
+
+ ut = compat_alloc_user_space(sizeof(*ut));
+ if (copy_to_user(ut, &t, sizeof(t)))
+ return -EFAULT;
+ }
+ return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
+}
+#endif
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 80ef38c73e5a..3168ee4e77f4 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -20,7 +20,7 @@
#include <linux/magic.h>
#include <linux/anon_inodes.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a1fba4285277..c885daae68c8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *);
/* Expiration */
int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
+int autofs4_expire_wait(const struct path *path, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
@@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait(struct autofs_sb_info *,
+ const struct path *, enum autofs_notify);
int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index fc09eb77ddf3..6f48d670c941 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -204,7 +204,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
/* Find the topmost mount satisfying test() */
static int find_autofs_mount(const char *pathname,
struct path *res,
- int test(struct path *path, void *data),
+ int test(const struct path *path, void *data),
void *data)
{
struct path path;
@@ -230,12 +230,12 @@ static int find_autofs_mount(const char *pathname,
return err;
}
-static int test_by_dev(struct path *path, void *p)
+static int test_by_dev(const struct path *path, void *p)
{
return path->dentry->d_sb->s_dev == *(dev_t *)p;
}
-static int test_by_type(struct path *path, void *p)
+static int test_by_type(const struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
@@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
ino = autofs4_dentry_ino(path.dentry);
if (ino) {
err = 0;
- autofs4_expire_wait(path.dentry, 0);
+ autofs4_expire_wait(&path, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid =
from_kuid_munged(current_user_ns(), ino->uid);
@@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
devid = new_encode_dev(dev);
- err = have_submounts(path.dentry);
+ err = path_has_submounts(&path);
if (follow_down_one(&path))
magic = path.dentry->d_sb->s_magic;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index d8e6d421c27f..57725d4a8c59 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -310,26 +310,29 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
now = jiffies;
timeout = sbi->exp_timeout;
- spin_lock(&sbi->fs_lock);
- ino = autofs4_dentry_ino(root);
- /* No point expiring a pending mount */
- if (ino->flags & AUTOFS_INF_PENDING)
- goto out;
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ spin_lock(&sbi->fs_lock);
+ ino = autofs4_dentry_ino(root);
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING) {
+ spin_unlock(&sbi->fs_lock);
+ goto out;
+ }
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
- spin_lock(&sbi->fs_lock);
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ spin_lock(&sbi->fs_lock);
ino->flags |= AUTOFS_INF_EXPIRING;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
return root;
}
+ spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+ spin_unlock(&sbi->fs_lock);
}
out:
- spin_unlock(&sbi->fs_lock);
dput(root);
return NULL;
@@ -495,8 +498,9 @@ found:
return expired;
}
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
+int autofs4_expire_wait(const struct path *path, int rcu_walk)
{
+ struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
@@ -525,7 +529,7 @@ retry:
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
- status = autofs4_wait(sbi, dentry, NFY_NONE);
+ status = autofs4_wait(sbi, path, NFY_NONE);
wait_for_completion(&ino->expire_complete);
pr_debug("expire done status=%d\n", status);
@@ -592,11 +596,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
if (dentry) {
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ const struct path path = { .mnt = mnt, .dentry = dentry };
/* This is synchronous because it makes the daemon a
* little easier
*/
- ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+ ret = autofs4_wait(sbi, &path, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
/* avoid rapid-fire expire attempts if expiry fails */
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 438b5bf675b6..09e7d68dff02 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -94,7 +94,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",indirect");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
- seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
seq_printf(m, ",pipe_ino=-1");
#endif
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a11f73174877..82e8f6edfb48 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file);
static struct dentry *autofs4_lookup(struct inode *,
struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool);
+static int autofs4_d_manage(const struct path *, bool);
static void autofs4_dentry_release(struct dentry *);
const struct file_operations autofs4_root_operations = {
@@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- if (!d_mountpoint(dentry) && simple_empty(dentry)) {
+ if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -269,39 +269,41 @@ next:
return NULL;
}
-static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
+static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- pr_debug("waiting for mount name=%pd\n", dentry);
- status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+ pr_debug("waiting for mount name=%pd\n", path->dentry);
+ status = autofs4_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
}
-static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
+static int do_expire_wait(const struct path *path, bool rcu_walk)
{
+ struct dentry *dentry = path->dentry;
struct dentry *expiring;
expiring = autofs4_lookup_expiring(dentry, rcu_walk);
if (IS_ERR(expiring))
return PTR_ERR(expiring);
if (!expiring)
- return autofs4_expire_wait(dentry, rcu_walk);
+ return autofs4_expire_wait(path, rcu_walk);
else {
+ const struct path this = { .mnt = path->mnt, .dentry = expiring };
/*
* If we are racing with expire the request might not
* be quite complete, but the directory has been removed
* so it must have been successful, just wait for it.
*/
- autofs4_expire_wait(expiring, 0);
+ autofs4_expire_wait(&this, 0);
autofs4_del_expiring(expiring);
dput(expiring);
}
@@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* and the directory was removed, so just go ahead and try
* the mount.
*/
- status = do_expire_wait(dentry, 0);
+ status = do_expire_wait(path, 0);
if (status && status != -EAGAIN)
return NULL;
@@ -362,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry, 0);
+ status = autofs4_mount_wait(path, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -370,28 +372,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
/*
* If the dentry is a symlink it's equivalent to a directory
- * having d_mountpoint() true, so there's no need to call back
- * to the daemon.
+ * having path_is_mountpoint() true, so there's no need to call
+ * back to the daemon.
*/
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
- if (!d_mountpoint(dentry)) {
+ if (!path_is_mountpoint(path)) {
/*
* It's possible that user space hasn't removed directories
* after umounting a rootless multi-mount, although it
- * should. For v5 have_submounts() is sufficient to handle
- * this because the leaves of the directory tree under the
- * mount never trigger mounts themselves (they have an autofs
- * trigger mount mounted on them). But v4 pseudo direct mounts
- * do need the leaves to trigger mounts. In this case we
- * have no choice but to use the list_empty() check and
+ * should. For v5 path_has_submounts() is sufficient to
+ * handle this because the leaves of the directory tree under
+ * the mount never trigger mounts themselves (they have an
+ * autofs trigger mount mounted on them). But v4 pseudo direct
+ * mounts do need the leaves to trigger mounts. In this case
+ * we have no choice but to use the list_empty() check and
* require user space behave.
*/
if (sbi->version > 4) {
- if (have_submounts(dentry)) {
+ if (path_has_submounts(path)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -403,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(dentry, 0);
+ status = autofs4_mount_wait(path, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -421,8 +423,9 @@ done:
return NULL;
}
-static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(const struct path *path, bool rcu_walk)
{
+ struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
@@ -431,20 +434,20 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
- if (!d_mountpoint(dentry))
+ if (!path_is_mountpoint(path))
return -EISDIR;
return 0;
}
/* Wait for pending expires */
- if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+ if (do_expire_wait(path, rcu_walk) == -ECHILD)
return -ECHILD;
/*
* This dentry may be under construction so wait on mount
* completion.
*/
- status = autofs4_mount_wait(dentry, rcu_walk);
+ status = autofs4_mount_wait(path, rcu_walk);
if (status)
return status;
@@ -460,7 +463,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
return 0;
- if (d_mountpoint(dentry))
+ if (path_is_mountpoint(path))
return 0;
inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
@@ -487,7 +490,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* we can avoid needless calls ->d_automount() and avoid
* an incorrect ELOOP error return.
*/
- if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+ if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
(d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 99aab00dc217..ab0b4285a202 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -25,6 +25,5 @@ static const char *autofs4_get_link(struct dentry *dentry,
}
const struct inode_operations autofs4_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = autofs4_get_link
};
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e44271dfceb6..1278335ce366 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
const struct qstr *qstr,
- struct dentry *dentry, enum autofs_notify notify)
+ const struct path *path, enum autofs_notify notify)
{
+ struct dentry *dentry = path->dentry;
struct autofs_wait_queue *wq;
struct autofs_info *ino;
@@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait,
*/
if (notify == NFY_MOUNT) {
struct dentry *new = NULL;
+ struct path this;
int valid = 1;
/*
@@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait,
dentry = new;
}
}
- if (have_submounts(dentry))
+ this.mnt = path->mnt;
+ this.dentry = dentry;
+ if (path_has_submounts(&this))
valid = 0;
if (new)
@@ -345,8 +349,9 @@ static int validate_request(struct autofs_wait_queue **wait,
}
int autofs4_wait(struct autofs_sb_info *sbi,
- struct dentry *dentry, enum autofs_notify notify)
+ const struct path *path, enum autofs_notify notify)
{
+ struct dentry *dentry = path->dentry;
struct autofs_wait_queue *wq;
struct qstr qstr;
char *name;
@@ -405,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
return -EINTR;
}
- ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+ ret = validate_request(&wq, sbi, &qstr, path, notify);
if (ret <= 0) {
if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 8712062275b8..5f685c819298 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -106,6 +106,50 @@ static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
return -EIO;
}
+static const char *bad_inode_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ return ERR_PTR(-EIO);
+}
+
+static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type)
+{
+ return ERR_PTR(-EIO);
+}
+
+static int bad_inode_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, u64 start,
+ u64 len)
+{
+ return -EIO;
+}
+
+static int bad_inode_update_time(struct inode *inode, struct timespec *time,
+ int flags)
+{
+ return -EIO;
+}
+
+static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
+ struct file *file, unsigned int open_flag,
+ umode_t create_mode, int *opened)
+{
+ return -EIO;
+}
+
+static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry,
+ umode_t mode)
+{
+ return -EIO;
+}
+
+static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return -EIO;
+}
+
static const struct inode_operations bad_inode_ops =
{
.create = bad_inode_create,
@@ -118,14 +162,17 @@ static const struct inode_operations bad_inode_ops =
.mknod = bad_inode_mknod,
.rename = bad_inode_rename2,
.readlink = bad_inode_readlink,
- /* follow_link must be no-op, otherwise unmounting this inode
- won't work */
- /* put_link returns void */
- /* truncate returns void */
.permission = bad_inode_permission,
.getattr = bad_inode_getattr,
.setattr = bad_inode_setattr,
.listxattr = bad_inode_listxattr,
+ .get_link = bad_inode_get_link,
+ .get_acl = bad_inode_get_acl,
+ .fiemap = bad_inode_fiemap,
+ .update_time = bad_inode_update_time,
+ .atomic_open = bad_inode_atomic_open,
+ .tmpfile = bad_inode_tmpfile,
+ .set_acl = bad_inode_set_acl,
};
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index c6bad51d8ec7..b914cfb03820 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -129,6 +129,7 @@ static inline befs_inode_addr
blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
{
befs_inode_addr iaddr;
+
iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift;
iaddr.start =
blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift);
@@ -140,7 +141,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
static inline unsigned int
befs_iaddrs_per_block(struct super_block *sb)
{
- return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
+ return BEFS_SB(sb)->block_size / sizeof(befs_disk_inode_addr);
}
#include "endian.h"
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index eb557d9dc8be..69c9d8cde955 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -55,12 +55,12 @@ enum super_flags {
};
#define BEFS_BYTEORDER_NATIVE 0x42494745
-#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
-#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
+#define BEFS_BYTEORDER_NATIVE_LE ((__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE))
+#define BEFS_BYTEORDER_NATIVE_BE ((__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE))
#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
-#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
-#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
+#define BEFS_SUPER_MAGIC1_LE ((__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1))
+#define BEFS_SUPER_MAGIC1_BE ((__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1))
/*
* Flags of inode
@@ -79,7 +79,7 @@ enum inode_flags {
BEFS_INODE_WAS_WRITTEN = 0x00020000,
BEFS_NO_TRANSACTION = 0x00040000,
};
-/*
+/*
* On-Disk datastructures of BeFS
*/
@@ -139,7 +139,7 @@ typedef struct {
} PACKED befs_super_block;
-/*
+/*
* Note: the indirect and dbl_indir block_runs may
* be longer than one block!
*/
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 7e135ea73fdd..d509887c580c 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -12,8 +12,8 @@
*
* Dominic Giampaolo, author of "Practical File System
* Design with the Be File System", for such a helpful book.
- *
- * Marcus J. Ranum, author of the b+tree package in
+ *
+ * Marcus J. Ranum, author of the b+tree package in
* comp.sources.misc volume 10. This code is not copied from that
* work, but it is partially based on it.
*
@@ -38,38 +38,38 @@
*/
/* Befs B+tree structure:
- *
+ *
* The first thing in the tree is the tree superblock. It tells you
* all kinds of useful things about the tree, like where the rootnode
* is located, and the size of the nodes (always 1024 with current version
* of BeOS).
*
* The rest of the tree consists of a series of nodes. Nodes contain a header
- * (struct befs_btree_nodehead), the packed key data, an array of shorts
+ * (struct befs_btree_nodehead), the packed key data, an array of shorts
* containing the ending offsets for each of the keys, and an array of
- * befs_off_t values. In interior nodes, the keys are the ending keys for
- * the childnode they point to, and the values are offsets into the
- * datastream containing the tree.
+ * befs_off_t values. In interior nodes, the keys are the ending keys for
+ * the childnode they point to, and the values are offsets into the
+ * datastream containing the tree.
*/
/* Note:
- *
- * The book states 2 confusing things about befs b+trees. First,
+ *
+ * The book states 2 confusing things about befs b+trees. First,
* it states that the overflow field of node headers is used by internal nodes
* to point to another node that "effectively continues this one". Here is what
* I believe that means. Each key in internal nodes points to another node that
- * contains key values less than itself. Inspection reveals that the last key
- * in the internal node is not the last key in the index. Keys that are
- * greater than the last key in the internal node go into the overflow node.
+ * contains key values less than itself. Inspection reveals that the last key
+ * in the internal node is not the last key in the index. Keys that are
+ * greater than the last key in the internal node go into the overflow node.
* I imagine there is a performance reason for this.
*
- * Second, it states that the header of a btree node is sufficient to
- * distinguish internal nodes from leaf nodes. Without saying exactly how.
+ * Second, it states that the header of a btree node is sufficient to
+ * distinguish internal nodes from leaf nodes. Without saying exactly how.
* After figuring out the first, it becomes obvious that internal nodes have
* overflow nodes and leafnodes do not.
*/
-/*
+/*
* Currently, this code is only good for directory B+trees.
* In order to be used for other BFS indexes, it needs to be extended to handle
* duplicate keys and non-string keytypes (int32, int64, float, double).
@@ -237,8 +237,8 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds,
* with @key (usually the disk block number of an inode).
*
* On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND.
- *
- * Algorithm:
+ *
+ * Algorithm:
* Read the superblock and rootnode of the b+tree.
* Drill down through the interior nodes using befs_find_key().
* Once at the correct leaf node, use befs_find_key() again to get the
@@ -402,12 +402,12 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
*
* Here's how it works: Key_no is the index of the key/value pair to
* return in keybuf/value.
- * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
+ * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
* the number of characters in the key (just a convenience).
*
* Algorithm:
* Get the first leafnode of the tree. See if the requested key is in that
- * node. If not, follow the node->right link to the next leafnode. Repeat
+ * node. If not, follow the node->right link to the next leafnode. Repeat
* until the (key_no)th key is found or the tree is out of keys.
*/
int
@@ -536,7 +536,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
* @node_off: Pointer to offset of current node within datastream. Modified
* by the function.
*
- * Helper function for btree traverse. Moves the current position to the
+ * Helper function for btree traverse. Moves the current position to the
* start of the first leaf node.
*
* Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY.
@@ -592,10 +592,10 @@ befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds,
}
/**
- * befs_leafnode - Determine if the btree node is a leaf node or an
+ * befs_leafnode - Determine if the btree node is a leaf node or an
* interior node
* @node: Pointer to node structure to test
- *
+ *
* Return 1 if leaf, 0 if interior
*/
static int
@@ -656,7 +656,7 @@ befs_bt_valarray(struct befs_btree_node *node)
* @node: Pointer to the node structure to find the keydata array within
*
* Returns a pointer to the start of the keydata array
- * of the node pointed to by the node header
+ * of the node pointed to by the node header
*/
static char *
befs_bt_keydata(struct befs_btree_node *node)
@@ -702,7 +702,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
/**
* befs_compare_strings - compare two strings
- * @key1: pointer to the first key to be compared
+ * @key1: pointer to the first key to be compared
* @keylen1: length in bytes of key1
* @key2: pointer to the second key to be compared
* @keylen2: length in bytes of key2
diff --git a/fs/befs/btree.h b/fs/befs/btree.h
index f2a8f637e9e0..60c6c728e64e 100644
--- a/fs/befs/btree.h
+++ b/fs/befs/btree.h
@@ -1,13 +1,11 @@
/*
* btree.h
- *
+ *
*/
-
int befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
- const char *key, befs_off_t * value);
+ const char *key, befs_off_t *value);
int befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
loff_t key_no, size_t bufsize, char *keybuf,
- size_t * keysize, befs_off_t * value);
-
+ size_t *keysize, befs_off_t *value);
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b4c7ba013c0d..720b3bc5c16a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -84,13 +84,11 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
*
* Takes a file position and gives back a brun who's starting block
* is block number fblock of the file.
- *
+ *
* Returns BEFS_OK or BEFS_ERR.
- *
+ *
* Calls specialized functions for each of the three possible
* datastream regions.
- *
- * 2001-11-15 Will Dyson
*/
int
befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
@@ -120,7 +118,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
/**
* befs_read_lsmylink - read long symlink from datastream.
- * @sb: Filesystem superblock
+ * @sb: Filesystem superblock
* @ds: Datastream to read from
* @buff: Buffer in which to place long symlink data
* @len: Length of the long symlink in bytes
diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h
index 91ba8203d83f..7ff9ff09ec6e 100644
--- a/fs/befs/datastream.h
+++ b/fs/befs/datastream.h
@@ -5,10 +5,10 @@
struct buffer_head *befs_read_datastream(struct super_block *sb,
const befs_data_stream *ds,
- befs_off_t pos, uint * off);
+ befs_off_t pos, uint *off);
int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
- befs_blocknr_t fblock, befs_block_run * run);
+ befs_blocknr_t fblock, befs_block_run *run);
size_t befs_read_lsymlink(struct super_block *sb, const befs_data_stream *data,
void *buff, befs_off_t len);
@@ -17,4 +17,3 @@ befs_blocknr_t befs_count_blocks(struct super_block *sb,
const befs_data_stream *ds);
extern const befs_inode_addr BAD_IADDR;
-
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 85c13392e9e8..36656c86f50e 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -1,6 +1,6 @@
/*
* linux/fs/befs/debug.c
- *
+ *
* Copyright (C) 2001 Will Dyson (will_dyson at pobox.com)
*
* With help from the ntfs-tng driver by Anton Altparmakov
@@ -57,6 +57,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
struct va_format vaf;
va_list args;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -67,7 +68,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
}
void
-befs_dump_inode(const struct super_block *sb, befs_inode * inode)
+befs_dump_inode(const struct super_block *sb, befs_inode *inode)
{
#ifdef CONFIG_BEFS_DEBUG
@@ -151,7 +152,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
*/
void
-befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
+befs_dump_super_block(const struct super_block *sb, befs_super_block *sup)
{
#ifdef CONFIG_BEFS_DEBUG
@@ -202,7 +203,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
#if 0
/* unused */
void
-befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
+befs_dump_small_data(const struct super_block *sb, befs_small_data *sd)
{
}
@@ -221,7 +222,8 @@ befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
#endif /* 0 */
void
-befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb,
+ befs_disk_btree_super *super)
{
#ifdef CONFIG_BEFS_DEBUG
@@ -242,7 +244,7 @@ befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * supe
}
void
-befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node)
+befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *node)
{
#ifdef CONFIG_BEFS_DEBUG
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index fa4b718de597..5367a6470a69 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -1,6 +1,6 @@
/*
* inode.c
- *
+ *
* Copyright (C) 2001 Will Dyson <will_dyson@pobox.com>
*/
@@ -10,12 +10,12 @@
#include "inode.h"
/*
- Validates the correctness of the befs inode
- Returns BEFS_OK if the inode should be used, otherwise
- returns BEFS_BAD_INODE
-*/
+ * Validates the correctness of the befs inode
+ * Returns BEFS_OK if the inode should be used, otherwise
+ * returns BEFS_BAD_INODE
+ */
int
-befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+befs_check_inode(struct super_block *sb, befs_inode *raw_inode,
befs_blocknr_t inode)
{
u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1);
diff --git a/fs/befs/inode.h b/fs/befs/inode.h
index 9dc7fd9b7570..2219e412f49b 100644
--- a/fs/befs/inode.h
+++ b/fs/befs/inode.h
@@ -1,8 +1,7 @@
/*
* inode.h
- *
+ *
*/
-int befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+int befs_check_inode(struct super_block *sb, befs_inode *raw_inode,
befs_blocknr_t inode);
-
diff --git a/fs/befs/io.c b/fs/befs/io.c
index b4a558126ee1..227cb86e07fe 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2001 Will Dyson <will_dyson@pobox.com
*
- * Based on portions of file.c and inode.c
+ * Based on portions of file.c and inode.c
* by Makoto Kato (m_kato@ga2.so-net.ne.jp)
*
* Many thanks to Dominic Giampaolo, author of Practical File System
@@ -19,8 +19,7 @@
/*
* Converts befs notion of disk addr to a disk offset and uses
* linux kernel function sb_bread() to get the buffer containing
- * the offset. -Will Dyson
- *
+ * the offset.
*/
struct buffer_head *
@@ -55,7 +54,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
befs_debug(sb, "<--- %s", __func__);
return bh;
- error:
+error:
befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
diff --git a/fs/befs/io.h b/fs/befs/io.h
index 78d7bc6e60de..9b3e1967cb31 100644
--- a/fs/befs/io.h
+++ b/fs/befs/io.h
@@ -4,4 +4,3 @@
struct buffer_head *befs_bread_iaddr(struct super_block *sb,
befs_inode_addr iaddr);
-
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 647a276eba56..19407165f4aa 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -18,6 +18,7 @@
#include <linux/parser.h>
#include <linux/namei.h>
#include <linux/sched.h>
+#include <linux/exportfs.h>
#include "befs.h"
#include "btree.h"
@@ -37,7 +38,8 @@ static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
static int befs_readpage(struct file *file, struct page *page);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
-static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int);
+static struct dentry *befs_lookup(struct inode *, struct dentry *,
+ unsigned int);
static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
@@ -51,6 +53,10 @@ static void befs_put_super(struct super_block *);
static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
static int parse_options(char *, struct befs_mount_options *);
+static struct dentry *befs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
@@ -83,9 +89,14 @@ static const struct address_space_operations befs_symlink_aops = {
.readpage = befs_symlink_readpage,
};
-/*
+static const struct export_operations befs_export_operations = {
+ .fh_to_dentry = befs_fh_to_dentry,
+ .fh_to_parent = befs_fh_to_parent,
+};
+
+/*
* Called by generic_file_read() to read a page of data
- *
+ *
* In turn, simply calls a generic block read function and
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
@@ -102,15 +113,13 @@ befs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, befs_get_block);
}
-/*
- * Generic function to map a file position (block) to a
+/*
+ * Generic function to map a file position (block) to a
* disk offset (passed back in bh_result).
*
* Used by many higher level functions.
*
* Calls befs_fblock2brun() in datastream.c to do the real work.
- *
- * -WD 10-26-01
*/
static int
@@ -269,15 +278,15 @@ befs_alloc_inode(struct super_block *sb)
struct befs_inode_info *bi;
bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
- if (!bi)
- return NULL;
- return &bi->vfs_inode;
+ if (!bi)
+ return NULL;
+ return &bi->vfs_inode;
}
static void befs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
+ kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
}
static void befs_destroy_inode(struct inode *inode)
@@ -287,7 +296,7 @@ static void befs_destroy_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct befs_inode_info *bi = (struct befs_inode_info *) foo;
+ struct befs_inode_info *bi = (struct befs_inode_info *) foo;
inode_init_once(&bi->vfs_inode);
}
@@ -338,7 +347,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
/*
* set uid and gid. But since current BeOS is single user OS, so
* you can change by "uid" or "gid" options.
- */
+ */
inode->i_uid = befs_sb->mount_opts.use_uid ?
befs_sb->mount_opts.uid :
@@ -353,14 +362,14 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
* BEFS's time is 64 bits, but current VFS is 32 bits...
* BEFS don't have access time. Nor inode change time. VFS
* doesn't have creation time.
- * Also, the lower 16 bits of the last_modified_time and
+ * Also, the lower 16 bits of the last_modified_time and
* create_time are just a counter to help ensure uniqueness
* for indexing purposes. (PFD, page 54)
*/
inode->i_mtime.tv_sec =
fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
- inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
+ inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
inode->i_ctime = inode->i_mtime;
inode->i_atime = inode->i_mtime;
@@ -414,10 +423,10 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
unlock_new_inode(inode);
return inode;
- unacquire_bh:
+unacquire_bh:
brelse(bh);
- unacquire_none:
+unacquire_none:
iget_failed(inode);
befs_debug(sb, "<--- %s - Bad inode", __func__);
return ERR_PTR(-EIO);
@@ -442,7 +451,7 @@ befs_init_inodecache(void)
}
/* Called at fs teardown.
- *
+ *
* Taken from NFS implementation by Al Viro.
*/
static void
@@ -491,13 +500,10 @@ fail:
}
/*
- * UTF-8 to NLS charset convert routine
- *
+ * UTF-8 to NLS charset convert routine
*
- * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than
- * the nls tables directly
+ * Uses uni2char() / char2uni() rather than the nls tables directly
*/
-
static int
befs_utf2nls(struct super_block *sb, const char *in,
int in_len, char **out, int *out_len)
@@ -521,9 +527,8 @@ befs_utf2nls(struct super_block *sb, const char *in,
}
*out = result = kmalloc(maxlen, GFP_NOFS);
- if (!*out) {
+ if (!*out)
return -ENOMEM;
- }
for (i = o = 0; i < in_len; i += utflen, o += unilen) {
@@ -546,7 +551,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
return o;
- conv_err:
+conv_err:
befs_error(sb, "Name using character set %s contains a character that "
"cannot be converted to unicode.", nls->charset);
befs_debug(sb, "<--- %s", __func__);
@@ -561,18 +566,18 @@ befs_utf2nls(struct super_block *sb, const char *in,
* @in_len: Length of input string in bytes
* @out: The output string in UTF-8 format
* @out_len: Length of the output buffer
- *
+ *
* Converts input string @in, which is in the format of the loaded NLS map,
* into a utf8 string.
- *
+ *
* The destination string @out is allocated by this function and the caller is
* responsible for freeing it with kfree()
- *
+ *
* On return, *@out_len is the length of @out in bytes.
*
* On success, the return value is the number of utf8 characters written to
* the output buffer @out.
- *
+ *
* On Failure, a negative number coresponding to the error code is returned.
*/
@@ -585,9 +590,11 @@ befs_nls2utf(struct super_block *sb, const char *in,
wchar_t uni;
int unilen, utflen;
char *result;
- /* There're nls characters that will translate to 3-chars-wide UTF-8
- * characters, a additional byte is needed to save the final \0
- * in special cases */
+ /*
+ * There are nls characters that will translate to 3-chars-wide UTF-8
+ * characters, an additional byte is needed to save the final \0
+ * in special cases
+ */
int maxlen = (3 * in_len) + 1;
befs_debug(sb, "---> %s\n", __func__);
@@ -624,14 +631,41 @@ befs_nls2utf(struct super_block *sb, const char *in,
return i;
- conv_err:
- befs_error(sb, "Name using charecter set %s contains a charecter that "
+conv_err:
+ befs_error(sb, "Name using character set %s contains a character that "
"cannot be converted to unicode.", nls->charset);
befs_debug(sb, "<--- %s", __func__);
kfree(result);
return -EILSEQ;
}
+static struct inode *befs_nfs_get_inode(struct super_block *sb, uint64_t ino,
+ uint32_t generation)
+{
+ /* No need to handle i_generation */
+ return befs_iget(sb, ino);
+}
+
+/*
+ * Map an NFS file handle to a corresponding dentry
+ */
+static struct dentry *befs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ befs_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a file specified by NFS handle
+ */
+static struct dentry *befs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ befs_nfs_get_inode);
+}
+
enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
@@ -666,6 +700,7 @@ parse_options(char *options, struct befs_mount_options *opts)
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -721,7 +756,7 @@ parse_options(char *options, struct befs_mount_options *opts)
}
/* This function has the responsibiltiy of getting the
- * filesystem ready for unmounting.
+ * filesystem ready for unmounting.
* Basically, we free everything that we allocated in
* befs_read_inode
*/
@@ -782,8 +817,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
* Linux 2.4.10 and later refuse to read blocks smaller than
* the logical block size for the device. But we also need to read at
* least 1k to get the second 512 bytes of the volume.
- * -WD 10-26-01
- */
+ */
blocksize = sb_min_blocksize(sb, 1024);
if (!blocksize) {
if (!silent)
@@ -791,7 +825,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
goto unacquire_priv_sbp;
}
- if (!(bh = sb_bread(sb, sb_block))) {
+ bh = sb_bread(sb, sb_block);
+ if (!bh) {
if (!silent)
befs_error(sb, "unable to read superblock");
goto unacquire_priv_sbp;
@@ -816,7 +851,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
- if( befs_sb->num_blocks > ~((sector_t)0) ) {
+ if (befs_sb->num_blocks > ~((sector_t)0)) {
if (!silent)
befs_error(sb, "blocks count: %llu is larger than the host can use",
befs_sb->num_blocks);
@@ -831,6 +866,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
/* Set real blocksize of fs */
sb_set_blocksize(sb, (ulong) befs_sb->block_size);
sb->s_op = &befs_sops;
+ sb->s_export_op = &befs_export_operations;
root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -861,16 +897,16 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
}
return 0;
-/*****************/
- unacquire_bh:
+
+unacquire_bh:
brelse(bh);
- unacquire_priv_sbp:
+unacquire_priv_sbp:
kfree(befs_sb->mount_opts.iocharset);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
- unacquire_none:
+unacquire_none:
return ret;
}
@@ -919,7 +955,7 @@ static struct file_system_type befs_fs_type = {
.name = "befs",
.mount = befs_mount,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("befs");
@@ -956,9 +992,9 @@ exit_befs_fs(void)
}
/*
-Macros that typecheck the init and exit functions,
-ensures that they are called at init and cleanup,
-and eliminates warnings about unused functions.
-*/
+ * Macros that typecheck the init and exit functions,
+ * ensures that they are called at init and cleanup,
+ * and eliminates warnings about unused functions.
+ */
module_init(init_befs_fs)
module_exit(exit_befs_fs)
diff --git a/fs/befs/super.h b/fs/befs/super.h
index dc4556376a22..ec1df30a7e9a 100644
--- a/fs/befs/super.h
+++ b/fs/befs/super.h
@@ -2,7 +2,5 @@
* super.h
*/
-int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb);
-
+int befs_load_sb(struct super_block *sb, befs_super_block *disk_sb);
int befs_check_sb(struct super_block *sb);
-
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1e5c896f6b79..f2deec0a62f0 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -16,7 +16,7 @@
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/uio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "bfs.h"
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ae1b5404fced..2a59139f520b 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -26,7 +26,7 @@
#include <linux/coredump.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/a.out-core.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..422370293cfd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,7 +36,7 @@
#include <linux/coredump.h>
#include <linux/sched.h>
#include <linux/dax.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+ if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
+ goto end_coredump;
+ vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
if (!vma_filesz)
goto end_coredump;
@@ -2296,6 +2298,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
}
+ dump_truncate(cprm);
if (!elf_core_write_extra_data(cprm))
goto end_coredump;
@@ -2311,7 +2314,7 @@ end_coredump:
cleanup:
free_note_info(&info);
kfree(shdr4extnum);
- kfree(vma_filesz);
+ vfree(vma_filesz);
kfree(phdr4note);
kfree(elf);
out:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 464a972e88c1..d2e36f82c35d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -37,7 +37,7 @@
#include <linux/coredump.h>
#include <linux/dax.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/param.h>
#include <asm/pgalloc.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05b553368bb4..3c47614a4b32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -30,8 +30,9 @@
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <linux/badblocks.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
struct bdev_inode {
@@ -175,17 +176,273 @@ static struct inode *bdev_file_inode(struct file *file)
return file->f_mapping->host;
}
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+ unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+ /* avoid the need for an I/O completion work item */
+ if (iocb->ki_flags & IOCB_DSYNC)
+ op |= REQ_FUA;
+ return op;
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+ struct task_struct *waiter = bio->bi_private;
+
+ WRITE_ONCE(bio->bi_private, NULL);
+ wake_up_process(waiter);
+}
+
static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+ int nr_pages)
+{
+ struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+ struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+ loff_t pos = iocb->ki_pos;
+ bool should_dirty = false;
+ struct bio bio;
+ ssize_t ret;
+ blk_qc_t qc;
+ int i;
+
+ if ((pos | iov_iter_alignment(iter)) &
+ (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ if (nr_pages <= DIO_INLINE_BIO_VECS)
+ vecs = inline_vecs;
+ else {
+ vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
+ if (!vecs)
+ return -ENOMEM;
+ }
+
+ bio_init(&bio, vecs, nr_pages);
+ bio.bi_bdev = bdev;
+ bio.bi_iter.bi_sector = pos >> 9;
+ bio.bi_private = current;
+ bio.bi_end_io = blkdev_bio_end_io_simple;
+
+ ret = bio_iov_iter_get_pages(&bio, iter);
+ if (unlikely(ret))
+ return ret;
+ ret = bio.bi_iter.bi_size;
+
+ if (iov_iter_rw(iter) == READ) {
+ bio.bi_opf = REQ_OP_READ;
+ if (iter_is_iovec(iter))
+ should_dirty = true;
+ } else {
+ bio.bi_opf = dio_bio_write_op(iocb);
+ task_io_account_write(ret);
+ }
+
+ qc = submit_bio(&bio);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(bio.bi_private))
+ break;
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_mq_poll(bdev_get_queue(bdev), qc))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ bio_for_each_segment_all(bvec, &bio, i) {
+ if (should_dirty && !PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
+ put_page(bvec->bv_page);
+ }
+
+ if (vecs != inline_vecs)
+ kfree(vecs);
+
+ if (unlikely(bio.bi_error))
+ return bio.bi_error;
+ return ret;
+}
+
+struct blkdev_dio {
+ union {
+ struct kiocb *iocb;
+ struct task_struct *waiter;
+ };
+ size_t size;
+ atomic_t ref;
+ bool multi_bio : 1;
+ bool should_dirty : 1;
+ bool is_sync : 1;
+ struct bio bio;
+};
+
+static struct bio_set *blkdev_dio_pool __read_mostly;
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+ struct blkdev_dio *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
+ if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
+ if (bio->bi_error && !dio->bio.bi_error)
+ dio->bio.bi_error = bio->bi_error;
+ } else {
+ if (!dio->is_sync) {
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret = dio->bio.bi_error;
+
+ if (likely(!ret)) {
+ ret = dio->size;
+ iocb->ki_pos += ret;
+ }
+
+ dio->iocb->ki_complete(iocb, ret, 0);
+ bio_put(&dio->bio);
+ } else {
+ struct task_struct *waiter = dio->waiter;
+
+ WRITE_ONCE(dio->waiter, NULL);
+ wake_up_process(waiter);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+}
+
+static ssize_t
+__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(inode);
+ struct blk_plug plug;
+ struct blkdev_dio *dio;
+ struct bio *bio;
+ bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ loff_t pos = iocb->ki_pos;
+ blk_qc_t qc = BLK_QC_T_NONE;
+ int ret;
+
+ if ((pos | iov_iter_alignment(iter)) &
+ (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+ bio_get(bio); /* extra ref for the completion handler */
+
+ dio = container_of(bio, struct blkdev_dio, bio);
+ dio->is_sync = is_sync = is_sync_kiocb(iocb);
+ if (dio->is_sync)
+ dio->waiter = current;
+ else
+ dio->iocb = iocb;
+
+ dio->size = 0;
+ dio->multi_bio = false;
+ dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+
+ blk_start_plug(&plug);
+ for (;;) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = pos >> 9;
+ bio->bi_private = dio;
+ bio->bi_end_io = blkdev_bio_end_io;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(ret)) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ break;
+ }
+
+ if (is_read) {
+ bio->bi_opf = REQ_OP_READ;
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+ } else {
+ bio->bi_opf = dio_bio_write_op(iocb);
+ task_io_account_write(bio->bi_iter.bi_size);
+ }
+
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ if (!nr_pages) {
+ qc = submit_bio(bio);
+ break;
+ }
+
+ if (!dio->multi_bio) {
+ dio->multi_bio = true;
+ atomic_set(&dio->ref, 2);
+ } else {
+ atomic_inc(&dio->ref);
+ }
+
+ submit_bio(bio);
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ }
+ blk_finish_plug(&plug);
+
+ if (!is_sync)
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->waiter))
+ break;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_mq_poll(bdev_get_queue(bdev), qc))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ ret = dio->bio.bi_error;
+ if (likely(!ret))
+ ret = dio->size;
+
+ bio_put(&dio->bio);
+ return ret;
+}
- return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
- blkdev_get_block, NULL, NULL,
- DIO_SKIP_DIO_COUNT);
+static ssize_t
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int nr_pages;
+
+ nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
+ if (!nr_pages)
+ return 0;
+ if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
+ return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+
+ return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
}
+static __init int blkdev_init(void)
+{
+ blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+ if (!blkdev_dio_pool)
+ return -ENOMEM;
+ return 0;
+}
+module_init(blkdev_init);
+
int __sync_blockdev(struct block_device *bdev, int wait)
{
if (!bdev)
@@ -832,7 +1089,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
return true; /* already a holder */
else if (bdev->bd_holder != NULL)
return false; /* held by someone else */
- else if (bdev->bd_contains == bdev)
+ else if (whole == bdev)
return true; /* is a whole device which isn't held */
else if (whole->bd_holder == bd_may_claim)
@@ -1950,6 +2207,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
@@ -1970,8 +2228,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
*/
iput(old_inode);
old_inode = inode;
+ bdev = I_BDEV(inode);
- func(I_BDEV(inode), arg);
+ mutex_lock(&bdev->bd_mutex);
+ if (bdev->bd_openers)
+ func(bdev, arg);
+ mutex_unlock(&bdev->bd_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock);
}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e0f071f6b5a7..ff0b0be92d61 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -86,6 +86,20 @@ btrfs_work_owner(struct btrfs_work *work)
return work->wq->fs_info;
}
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
+{
+ /*
+ * We could compare wq->normal->pending with num_online_cpus()
+ * to support "thresh == NO_THRESHOLD" case, but it requires
+ * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
+ * postpone it until someone needs the support of that case.
+ */
+ if (wq->normal->thresh == NO_THRESHOLD)
+ return false;
+
+ return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+}
+
BTRFS_WORK_HELPER(worker_helper);
BTRFS_WORK_HELPER(delalloc_helper);
BTRFS_WORK_HELPER(flush_delalloc_helper);
@@ -259,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
unsigned long flags;
while (1) {
+ void *wtag;
+
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
@@ -285,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
spin_unlock_irqrestore(lock, flags);
/*
- * we don't want to call the ordered free functions
- * with the lock held though
+ * We don't want to call the ordered free functions with the
+ * lock held though. Save the work as tag for the trace event,
+ * because the callback could free the structure.
*/
+ wtag = work;
work->ordered_free(work);
- trace_btrfs_all_work_done(work);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
}
spin_unlock_irqrestore(lock, flags);
}
@@ -297,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
static void normal_work_helper(struct btrfs_work *work)
{
struct __btrfs_workqueue *wq;
+ void *wtag;
int need_order = 0;
/*
@@ -310,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work)
if (work->ordered_func)
need_order = 1;
wq = work->wq;
+ /* Safe for tracepoints in case work gets freed by the callback */
+ wtag = work;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@@ -319,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work)
run_ordered_work(wq);
}
if (!need_order)
- trace_btrfs_all_work_done(work);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
}
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 8e52484cd461..1f9597355c9d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -84,4 +84,5 @@ void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 85dc7ab8f89e..8299601a3549 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -788,8 +788,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
if (ref->key_for_search.type)
continue;
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
- 0);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -1405,8 +1404,7 @@ again:
ref->level == 0) {
struct extent_buffer *eb;
- eb = read_tree_block(fs_info->extent_root,
- ref->parent, 0);
+ eb = read_tree_block(fs_info, ref->parent, 0);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1829,7 +1827,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
- size = fs_info->extent_root->nodesize;
+ size = fs_info->nodesize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
@@ -2058,7 +2056,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- btrfs_end_transaction(trans, fs_info->extent_root);
+ btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 8e99251650b3..ab14c2e635ca 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -254,7 +254,7 @@ struct btrfsic_state {
struct list_head all_blocks_list;
struct btrfsic_block_hashtable block_hashtable;
struct btrfsic_block_link_hashtable block_link_hashtable;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
u64 max_superblock_generation;
struct btrfsic_block *latest_superblock;
u32 metablock_size;
@@ -646,11 +646,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfs_fs_devices *fs_devices)
{
- int ret = 0;
+ struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *selected_super;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfsic_dev_state *selected_dev_state = NULL;
+ int ret = 0;
int pass;
BUG_ON(NULL == state);
@@ -716,9 +717,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
break;
}
- num_copies =
- btrfs_num_copies(state->root->fs_info,
- next_bytenr, state->metablock_size);
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
next_bytenr, num_copies);
@@ -783,6 +783,7 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfsic_dev_state **selected_dev_state,
struct btrfs_super_block *selected_super)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *super_tmp;
u64 dev_bytenr;
struct buffer_head *bh;
@@ -832,7 +833,7 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- btrfs_info_in_rcu(device->dev_root->fs_info,
+ btrfs_info_in_rcu(fs_info,
"new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
@@ -887,9 +888,8 @@ static int btrfsic_process_superblock_dev_mirror(
break;
}
- num_copies =
- btrfs_num_copies(state->root->fs_info,
- next_bytenr, state->metablock_size);
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
next_bytenr, num_copies);
@@ -1254,6 +1254,7 @@ static int btrfsic_create_link_to_next_block(
struct btrfs_disk_key *disk_key,
u64 parent_generation)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfsic_block *next_block = NULL;
int ret;
struct btrfsic_block_link *l;
@@ -1262,9 +1263,8 @@ static int btrfsic_create_link_to_next_block(
*next_blockp = NULL;
if (0 == *num_copiesp) {
- *num_copiesp =
- btrfs_num_copies(state->root->fs_info,
- next_bytenr, state->metablock_size);
+ *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
+ state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
next_bytenr, *num_copiesp);
@@ -1390,13 +1390,14 @@ static int btrfsic_handle_extent_data(
struct btrfsic_block_data_ctx *block_ctx,
u32 item_offset, int force_iodone_flag)
{
- int ret;
+ struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_file_extent_item file_extent_item;
u64 file_extent_item_offset;
u64 next_bytenr;
u64 num_bytes;
u64 generation;
struct btrfsic_block_link *l;
+ int ret;
file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
item_offset;
@@ -1456,9 +1457,8 @@ static int btrfsic_handle_extent_data(
else
chunk_len = num_bytes;
- num_copies =
- btrfs_num_copies(state->root->fs_info,
- next_bytenr, state->datablock_size);
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ state->datablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
next_bytenr, num_copies);
@@ -1533,13 +1533,14 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfsic_block_data_ctx *block_ctx_out,
int mirror_num)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
struct btrfs_bio *multi = NULL;
struct btrfs_device *device;
length = len;
- ret = btrfs_map_block(state->root->fs_info, READ,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
bytenr, &length, &multi, mirror_num);
if (ret) {
@@ -1731,6 +1732,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
char **datav, unsigned int num_pages)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_header *h;
u8 csum[BTRFS_CSUM_SIZE];
u32 crc = ~(u32)0;
@@ -1741,7 +1743,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
num_pages = state->metablock_size >> PAGE_SHIFT;
h = (struct btrfs_header *)datav[0];
- if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
return 1;
for (i = 0; i < num_pages; i++) {
@@ -2202,6 +2204,7 @@ static int btrfsic_process_written_superblock(
struct btrfsic_block *const superblock,
struct btrfs_super_block *const super_hdr)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
int pass;
superblock->generation = btrfs_super_generation(super_hdr);
@@ -2275,9 +2278,8 @@ static int btrfsic_process_written_superblock(
break;
}
- num_copies =
- btrfs_num_copies(state->root->fs_info,
- next_bytenr, BTRFS_SUPER_INFO_SIZE);
+ num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ BTRFS_SUPER_INFO_SIZE);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
next_bytenr, num_copies);
@@ -2699,14 +2701,14 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
struct btrfsic_dev_state *dev_state,
u64 dev_bytenr)
{
+ struct btrfs_fs_info *fs_info = state->fs_info;
+ struct btrfsic_block_data_ctx block_ctx;
int num_copies;
int mirror_num;
- int ret;
- struct btrfsic_block_data_ctx block_ctx;
int match = 0;
+ int ret;
- num_copies = btrfs_num_copies(state->root->fs_info,
- bytenr, state->metablock_size);
+ num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr, state->metablock_size,
@@ -2819,10 +2821,11 @@ static void __btrfsic_submit_bio(struct bio *bio)
* btrfsic_mount(), this might return NULL */
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) {
+ (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i;
u64 dev_bytenr;
u64 cur_bytenr;
+ struct bio_vec *bvec;
int bio_is_patched;
char **mapped_datav;
@@ -2840,32 +2843,23 @@ static void __btrfsic_submit_bio(struct bio *bio)
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
- for (i = 0; i < bio->bi_vcnt; i++) {
- BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
- if (!mapped_datav[i]) {
- while (i > 0) {
- i--;
- kunmap(bio->bi_io_vec[i].bv_page);
- }
- kfree(mapped_datav);
- goto leave;
- }
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ BUG_ON(bvec->bv_len != PAGE_SIZE);
+ mapped_datav[i] = kmap(bvec->bv_page);
+
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bio->bi_io_vec[i].bv_len,
- bio->bi_io_vec[i].bv_offset);
- cur_bytenr += bio->bi_io_vec[i].bv_len;
+ i, cur_bytenr, bvec->bv_len, bvec->bv_offset);
+ cur_bytenr += bvec->bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, bio->bi_vcnt,
bio, &bio_is_patched,
NULL, bio->bi_opf);
- while (i > 0) {
- i--;
- kunmap(bio->bi_io_vec[i].bv_page);
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ kunmap(bvec->bv_page);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
@@ -2910,7 +2904,7 @@ int btrfsic_submit_bio_wait(struct bio *bio)
return submit_bio_wait(bio);
}
-int btrfsic_mount(struct btrfs_root *root,
+int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices,
int including_extent_data, u32 print_mask)
{
@@ -2919,14 +2913,14 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize & ((u64)PAGE_SIZE - 1)) {
+ if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) {
pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
- root->nodesize, PAGE_SIZE);
+ fs_info->nodesize, PAGE_SIZE);
return -1;
}
- if (root->sectorsize & ((u64)PAGE_SIZE - 1)) {
+ if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) {
pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
- root->sectorsize, PAGE_SIZE);
+ fs_info->sectorsize, PAGE_SIZE);
return -1;
}
state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
@@ -2944,12 +2938,12 @@ int btrfsic_mount(struct btrfs_root *root,
btrfsic_is_initialized = 1;
}
mutex_lock(&btrfsic_mutex);
- state->root = root;
+ state->fs_info = fs_info;
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
state->csum_size = 0;
- state->metablock_size = root->nodesize;
- state->datablock_size = root->sectorsize;
+ state->metablock_size = fs_info->nodesize;
+ state->datablock_size = fs_info->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
btrfsic_block_hashtable_init(&state->block_hashtable);
btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -2982,7 +2976,7 @@ int btrfsic_mount(struct btrfs_root *root,
ret = btrfsic_process_superblock(state, fs_devices);
if (0 != ret) {
mutex_unlock(&btrfsic_mutex);
- btrfsic_unmount(root, fs_devices);
+ btrfsic_unmount(fs_devices);
return ret;
}
@@ -2995,8 +2989,7 @@ int btrfsic_mount(struct btrfs_root *root,
return 0;
}
-void btrfsic_unmount(struct btrfs_root *root,
- struct btrfs_fs_devices *fs_devices)
+void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
{
struct btrfsic_block *b_all, *tmp_all;
struct btrfsic_state *state;
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index f78dff1c7e86..2de58a99ee92 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -29,10 +29,9 @@ int btrfsic_submit_bio_wait(struct bio *bio);
#define btrfsic_submit_bio_wait submit_bio_wait
#endif
-int btrfsic_mount(struct btrfs_root *root,
+int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices,
int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_root *root,
- struct btrfs_fs_devices *fs_devices);
+void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d4d8b7e36b2f..c4444d6f439f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -81,17 +81,17 @@ struct compressed_bio {
u32 sums;
};
-static int btrfs_decompress_biovec(int type, struct page **pages_in,
- u64 disk_start, struct bio_vec *bvec,
- int vcnt, size_t srclen);
+static int btrfs_decompress_bio(int type, struct page **pages_in,
+ u64 disk_start, struct bio *orig_bio,
+ size_t srclen);
-static inline int compressed_bio_size(struct btrfs_root *root,
+static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
+ (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}
static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -120,7 +120,7 @@ static int check_compressed_csum(struct inode *inode,
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
- btrfs_csum_final(csum, (char *)&csum);
+ btrfs_csum_final(csum, (u8 *)&csum);
kunmap_atomic(kaddr);
if (csum != *cb_sum) {
@@ -175,11 +175,10 @@ static void end_compressed_bio_read(struct bio *bio)
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
- ret = btrfs_decompress_biovec(cb->compress_type,
+ ret = btrfs_decompress_bio(cb->compress_type,
cb->compressed_pages,
cb->start,
- cb->orig_bio->bi_io_vec,
- cb->orig_bio->bi_vcnt,
+ cb->orig_bio,
cb->compressed_len);
csum_failed:
if (ret)
@@ -329,8 +328,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct page **compressed_pages,
unsigned long nr_pages)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio *bio = NULL;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct compressed_bio *cb;
unsigned long bytes_left;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -342,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
WARN_ON(start & ((u64)PAGE_SIZE - 1));
- cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return -ENOMEM;
atomic_set(&cb->pending_bios, 0);
@@ -356,7 +355,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_bdev;
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
if (!bio) {
@@ -392,17 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* freed before we're done setting it up
*/
atomic_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
+ ret = btrfs_bio_wq_end_io(fs_info, bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
- ret = btrfs_csum_one_bio(root, inode, bio,
- start, 1);
+ ret = btrfs_csum_one_bio(inode, bio, start, 1);
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -418,7 +416,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_info(fs_info,
"bytes left %lu compress len %lu nr %lu",
bytes_left, cb->compressed_len, cb->nr_pages);
}
@@ -428,15 +426,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
- ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+ ret = btrfs_csum_one_bio(inode, bio, start, 1);
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -446,6 +444,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
return 0;
}
+static u64 bio_end_offset(struct bio *bio)
+{
+ struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+}
+
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
@@ -464,8 +469,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
u64 end;
int misses = 0;
- page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
- last_offset = (page_offset(page) + PAGE_SIZE);
+ last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
@@ -563,7 +567,6 @@ next:
*
* bio->bi_iter.bi_sector points to the compressed extent on disk
* bio->bi_io_vec points to all of the inode pages
- * bio->bi_vcnt is a count of pages
*
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
@@ -571,11 +574,10 @@ next:
int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
unsigned long compressed_len;
unsigned long nr_pages;
unsigned long pg_index;
@@ -603,7 +605,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
return -EIO;
compressed_len = em->block_len;
- cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
goto out;
@@ -620,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
free_extent_map(em);
em = NULL;
- cb->len = uncompressed_len;
+ cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
@@ -631,7 +633,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb->compressed_pages)
goto fail1;
- bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_bdev;
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
@@ -648,8 +650,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
- uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
- cb->len = uncompressed_len;
+ cb->len = bio->bi_iter.bi_size;
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
if (!comp_bio)
@@ -676,8 +677,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
PAGE_SIZE) {
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
+ ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
+ BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
/*
@@ -689,14 +690,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
atomic_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(root, inode,
- comp_bio, sums);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio,
+ sums);
BUG_ON(ret); /* -ENOMEM */
}
sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- root->sectorsize);
+ fs_info->sectorsize);
- ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
comp_bio->bi_error = ret;
bio_endio(comp_bio);
@@ -717,16 +718,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
bio_get(comp_bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
+ ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
comp_bio->bi_error = ret;
bio_endio(comp_bio);
@@ -959,9 +959,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
*
* disk_start is the starting logical offset of this array in the file
*
- * bvec is a bio_vec of pages from the file that we want to decompress into
- *
- * vcnt is the count of pages in the biovec
+ * orig_bio contains the pages from the file that we want to decompress into
*
* srclen is the number of bytes in pages_in
*
@@ -970,18 +968,18 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
* be contiguous. They all correspond to the range of bytes covered by
* the compressed extent.
*/
-static int btrfs_decompress_biovec(int type, struct page **pages_in,
- u64 disk_start, struct bio_vec *bvec,
- int vcnt, size_t srclen)
+static int btrfs_decompress_bio(int type, struct page **pages_in,
+ u64 disk_start, struct bio *orig_bio,
+ size_t srclen)
{
struct list_head *workspace;
int ret;
workspace = find_workspace(type);
- ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
- disk_start,
- bvec, vcnt, srclen);
+ ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in,
+ disk_start, orig_bio,
+ srclen);
free_workspace(type, workspace);
return ret;
}
@@ -1021,23 +1019,22 @@ void btrfs_exit_compress(void)
*/
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
- struct bio_vec *bvec, int vcnt,
- unsigned long *pg_index,
- unsigned long *pg_offset)
+ struct bio *bio)
{
unsigned long buf_offset;
unsigned long current_buf_start;
unsigned long start_byte;
+ unsigned long prev_start_byte;
unsigned long working_bytes = total_out - buf_start;
unsigned long bytes;
char *kaddr;
- struct page *page_out = bvec[*pg_index].bv_page;
+ struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
/*
* start byte is the first byte of the page we're currently
* copying into relative to the start of the compressed data.
*/
- start_byte = page_offset(page_out) - disk_start;
+ start_byte = page_offset(bvec.bv_page) - disk_start;
/* we haven't yet hit data corresponding to this page */
if (total_out <= start_byte)
@@ -1057,29 +1054,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
- bytes = min(PAGE_SIZE - *pg_offset,
- PAGE_SIZE - buf_offset);
+ bytes = min_t(unsigned long, bvec.bv_len,
+ PAGE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(page_out);
- memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+
+ kaddr = kmap_atomic(bvec.bv_page);
+ memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes);
kunmap_atomic(kaddr);
- flush_dcache_page(page_out);
+ flush_dcache_page(bvec.bv_page);
- *pg_offset += bytes;
buf_offset += bytes;
working_bytes -= bytes;
current_buf_start += bytes;
/* check if we need to pick another page */
- if (*pg_offset == PAGE_SIZE) {
- (*pg_index)++;
- if (*pg_index >= vcnt)
- return 0;
-
- page_out = bvec[*pg_index].bv_page;
- *pg_offset = 0;
- start_byte = page_offset(page_out) - disk_start;
+ bio_advance(bio, bytes);
+ if (!bio->bi_iter.bi_size)
+ return 0;
+ bvec = bio_iter_iovec(bio, bio->bi_iter);
+ prev_start_byte = start_byte;
+ start_byte = page_offset(bvec.bv_page) - disk_start;
+ /*
+ * We need to make sure we're only adjusting
+ * our offset into compression working buffer when
+ * we're switching pages. Otherwise we can incorrectly
+ * keep copying when we were actually done.
+ */
+ if (start_byte != prev_start_byte) {
/*
* make sure our new page is covered by this
* working buffer
@@ -1103,34 +1105,3 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
return 1;
}
-
-/*
- * When uncompressing data, we need to make sure and zero any parts of
- * the biovec that were not filled in by the decompression code. pg_index
- * and pg_offset indicate the last page and the last offset of that page
- * that have been filled in. This will zero everything remaining in the
- * biovec.
- */
-void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
- unsigned long pg_index,
- unsigned long pg_offset)
-{
- while (pg_index < vcnt) {
- struct page *page = bvec[pg_index].bv_page;
- unsigned long off = bvec[pg_index].bv_offset;
- unsigned long len = bvec[pg_index].bv_len;
-
- if (pg_offset < off)
- pg_offset = off;
- if (pg_offset < off + len) {
- unsigned long bytes = off + len - pg_offset;
- char *kaddr;
-
- kaddr = kmap_atomic(page);
- memset(kaddr + pg_offset, 0, bytes);
- kunmap_atomic(kaddr);
- }
- pg_index++;
- pg_offset = 0;
- }
-}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index f49d8b8c0f00..09879579fbc8 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,9 +34,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
- struct bio_vec *bvec, int vcnt,
- unsigned long *pg_index,
- unsigned long *pg_offset);
+ struct bio *bio);
int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
@@ -45,9 +43,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long nr_pages);
int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
- unsigned long pg_index,
- unsigned long pg_offset);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -72,11 +67,10 @@ struct btrfs_compress_op {
unsigned long *total_out,
unsigned long max_out);
- int (*decompress_biovec)(struct list_head *workspace,
+ int (*decompress_bio)(struct list_head *workspace,
struct page **pages_in,
u64 disk_start,
- struct bio_vec *bvec,
- int vcnt,
+ struct bio *orig_bio,
size_t srclen);
int (*decompress)(struct list_head *workspace,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6ba165d3f81..a426dc822d4d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -32,10 +32,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *dst,
+ struct btrfs_fs_info *fs_info,
+ struct extent_buffer *dst,
struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
@@ -212,21 +213,23 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
!test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
return;
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
- &root->fs_info->dirty_cowonly_roots);
+ &fs_info->dirty_cowonly_roots);
else
list_move(&root->dirty_list,
- &root->fs_info->dirty_cowonly_roots);
+ &fs_info->dirty_cowonly_roots);
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
}
/*
@@ -239,13 +242,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cow;
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
- trans->transid != root->fs_info->running_transaction->transid);
+ trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
trans->transid != root->last_trans);
@@ -260,7 +264,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
if (IS_ERR(cow))
return PTR_ERR(cow);
- copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ copy_extent_buffer_full(cow, buf);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
@@ -271,8 +275,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, new_root_objectid);
- write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
- BTRFS_FSID_SIZE);
+ write_extent_buffer_fsid(cow, fs_info->fsid);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -978,6 +981,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
struct extent_buffer *cow,
int *last_ref)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 refs;
u64 owner;
u64 flags;
@@ -1002,14 +1006,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
*/
if (btrfs_block_can_be_shared(root, buf)) {
- ret = btrfs_lookup_extent_info(trans, root, buf->start,
+ ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
&refs, &flags);
if (ret)
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_handle_fs_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
} else {
@@ -1052,7 +1056,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans, root,
+ ret = btrfs_set_disk_extent_flags(trans, fs_info,
buf->start,
buf->len,
new_flags, level, 0);
@@ -1070,7 +1074,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
- clean_tree_block(trans, root->fs_info, buf);
+ clean_tree_block(trans, fs_info, buf);
*last_ref = 1;
}
return 0;
@@ -1095,6 +1099,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret,
u64 search_start, u64 empty_size)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
int level, ret;
@@ -1108,7 +1113,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
- trans->transid != root->fs_info->running_transaction->transid);
+ trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
trans->transid != root->last_trans);
@@ -1130,7 +1135,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
/* cow is set to blocking by btrfs_init_new_buffer */
- copy_extent_buffer(cow, buf, 0, 0, cow->len);
+ copy_extent_buffer_full(cow, buf);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
@@ -1141,8 +1146,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, root->root_key.objectid);
- write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
- BTRFS_FSID_SIZE);
+ write_extent_buffer_fsid(cow, fs_info->fsid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
@@ -1174,7 +1178,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+ tree_mod_log_insert_key(fs_info, parent, parent_slot,
MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -1182,7 +1186,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(root->fs_info, buf);
+ ret = tree_mod_log_free_eb(fs_info, buf);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1359,8 +1363,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
- eb->len);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1388,7 +1391,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb_rewin;
}
@@ -1403,6 +1406,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
@@ -1412,7 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
u64 logical;
eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
if (!tm)
return eb_root;
@@ -1424,16 +1428,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
logical = eb_root->start;
}
- tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+ tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(root, logical, 0);
+ old = read_tree_block(fs_info, logical, 0);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
- btrfs_warn(root->fs_info,
- "failed to read tree block %llu from get_old_root", logical);
+ btrfs_warn(fs_info,
+ "failed to read tree block %llu from get_old_root",
+ logical);
} else {
eb = btrfs_clone_extent_buffer(old);
free_extent_buffer(old);
@@ -1441,8 +1446,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(root->fs_info, logical,
- root->nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -1462,10 +1466,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_generation(eb, old_generation);
}
if (tm)
- __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
+ __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb;
}
@@ -1527,17 +1531,18 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
int ret;
- if (trans->transaction != root->fs_info->running_transaction)
+ if (trans->transaction != fs_info->running_transaction)
WARN(1, KERN_CRIT "trans %llu running %llu\n",
trans->transid,
- root->fs_info->running_transaction->transid);
+ fs_info->running_transaction->transid);
- if (trans->transid != root->fs_info->generation)
+ if (trans->transid != fs_info->generation)
WARN(1, KERN_CRIT "trans %llu running %llu\n",
- trans->transid, root->fs_info->generation);
+ trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
trans->dirty = true;
@@ -1614,6 +1619,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
int start_slot, u64 *last_ret,
struct btrfs_key *progress)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
u64 gen;
@@ -1632,11 +1638,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
parent_level = btrfs_header_level(parent);
- WARN_ON(trans->transaction != root->fs_info->running_transaction);
- WARN_ON(trans->transid != root->fs_info->generation);
+ WARN_ON(trans->transaction != fs_info->running_transaction);
+ WARN_ON(trans->transid != fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
- blocksize = root->nodesize;
+ blocksize = fs_info->nodesize;
end_slot = parent_nritems - 1;
if (parent_nritems <= 1)
@@ -1670,14 +1676,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root->fs_info, blocknr);
+ cur = find_extent_buffer(fs_info, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(root, blocknr, gen);
+ cur = read_tree_block(fs_info, blocknr, gen);
if (IS_ERR(cur)) {
return PTR_ERR(cur);
} else if (!extent_buffer_uptodate(cur)) {
@@ -1715,7 +1721,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
return err;
}
-
/*
* search for key in the extent_buffer. The items start at offset p,
* and they are item_size apart. There are 'max' items in p.
@@ -1839,8 +1844,9 @@ static void root_sub_used(struct btrfs_root *root, u32 size)
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
*/
-static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
- struct extent_buffer *parent, int slot)
+static noinline struct extent_buffer *
+read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
+ int slot)
{
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
@@ -1850,7 +1856,7 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
BUG_ON(level == 0);
- eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+ eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
@@ -1869,6 +1875,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = NULL;
struct extent_buffer *mid;
struct extent_buffer *left = NULL;
@@ -1906,10 +1913,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
/* promote the child to a root */
- child = read_node_slot(root, mid, 0);
+ child = read_node_slot(fs_info, mid, 0);
if (IS_ERR(child)) {
ret = PTR_ERR(child);
- btrfs_handle_fs_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto enospc;
}
@@ -1930,7 +1937,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
path->nodes[level] = NULL;
- clean_tree_block(trans, root->fs_info, mid);
+ clean_tree_block(trans, fs_info, mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
@@ -1942,10 +1949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
}
if (btrfs_header_nritems(mid) >
- BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
return 0;
- left = read_node_slot(root, parent, pslot - 1);
+ left = read_node_slot(fs_info, parent, pslot - 1);
if (IS_ERR(left))
left = NULL;
@@ -1960,7 +1967,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
}
- right = read_node_slot(root, parent, pslot + 1);
+ right = read_node_slot(fs_info, parent, pslot + 1);
if (IS_ERR(right))
right = NULL;
@@ -1978,7 +1985,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* first, try to make some room in the middle buffer */
if (left) {
orig_slot += btrfs_header_nritems(left);
- wret = push_node_left(trans, root, left, mid, 1);
+ wret = push_node_left(trans, fs_info, left, mid, 1);
if (wret < 0)
ret = wret;
}
@@ -1987,11 +1994,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
* then try to empty the right most buffer into the middle
*/
if (right) {
- wret = push_node_left(trans, root, mid, right, 1);
+ wret = push_node_left(trans, fs_info, mid, right, 1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- clean_tree_block(trans, root->fs_info, right);
+ clean_tree_block(trans, fs_info, right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
@@ -2001,7 +2008,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent,
+ tree_mod_log_set_node_key(fs_info, parent,
pslot + 1, 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2019,23 +2026,23 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_handle_fs_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto enospc;
}
- wret = balance_node_right(trans, root, mid, left);
+ wret = balance_node_right(trans, fs_info, mid, left);
if (wret < 0) {
ret = wret;
goto enospc;
}
if (wret == 1) {
- wret = push_node_left(trans, root, left, mid, 1);
+ wret = push_node_left(trans, fs_info, left, mid, 1);
if (wret < 0)
ret = wret;
}
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- clean_tree_block(trans, root->fs_info, mid);
+ clean_tree_block(trans, fs_info, mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
@@ -2046,8 +2053,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent,
- pslot, 0);
+ tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -2094,6 +2100,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = NULL;
struct extent_buffer *mid;
struct extent_buffer *left = NULL;
@@ -2117,7 +2124,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (!parent)
return 1;
- left = read_node_slot(root, parent, pslot - 1);
+ left = read_node_slot(fs_info, parent, pslot - 1);
if (IS_ERR(left))
left = NULL;
@@ -2129,7 +2136,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(left);
left_nr = btrfs_header_nritems(left);
- if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
@@ -2137,7 +2144,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (ret)
wret = 1;
else {
- wret = push_node_left(trans, root,
+ wret = push_node_left(trans, fs_info,
left, mid, 0);
}
}
@@ -2147,8 +2154,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent,
- pslot, 0);
+ tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -2169,7 +2175,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(left);
free_extent_buffer(left);
}
- right = read_node_slot(root, parent, pslot + 1);
+ right = read_node_slot(fs_info, parent, pslot + 1);
if (IS_ERR(right))
right = NULL;
@@ -2183,7 +2189,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(right);
right_nr = btrfs_header_nritems(right);
- if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+ if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, right,
@@ -2192,7 +2198,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (ret)
wret = 1;
else {
- wret = balance_node_right(trans, root,
+ wret = balance_node_right(trans, fs_info,
right, mid);
}
}
@@ -2202,7 +2208,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- tree_mod_log_set_node_key(root->fs_info, parent,
+ tree_mod_log_set_node_key(fs_info, parent,
pslot + 1, 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2230,7 +2236,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* readahead one full node of leaves, finding things that are close
* to the block in 'slot', and triggering ra on them.
*/
-static void reada_for_search(struct btrfs_root *root,
+static void reada_for_search(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int level, int slot, u64 objectid)
{
@@ -2254,8 +2260,8 @@ static void reada_for_search(struct btrfs_root *root,
node = path->nodes[level];
search = btrfs_node_blockptr(node, slot);
- blocksize = root->nodesize;
- eb = btrfs_find_tree_block(root->fs_info, search);
+ blocksize = fs_info->nodesize;
+ eb = find_extent_buffer(fs_info, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2284,7 +2290,7 @@ static void reada_for_search(struct btrfs_root *root,
search = btrfs_node_blockptr(node, nr);
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(root, search);
+ readahead_tree_block(fs_info, search);
nread += blocksize;
}
nscan++;
@@ -2293,7 +2299,7 @@ static void reada_for_search(struct btrfs_root *root,
}
}
-static noinline void reada_for_balance(struct btrfs_root *root,
+static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, int level)
{
int slot;
@@ -2314,7 +2320,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root->fs_info, block1);
+ eb = find_extent_buffer(fs_info, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2327,16 +2333,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root->fs_info, block2);
+ eb = find_extent_buffer(fs_info, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
}
if (block1)
- readahead_tree_block(root, block1);
+ readahead_tree_block(fs_info, block1);
if (block2)
- readahead_tree_block(root, block2);
+ readahead_tree_block(fs_info, block2);
}
@@ -2436,6 +2442,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
struct extent_buffer **eb_ret, int level, int slot,
struct btrfs_key *key, u64 time_seq)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
struct extent_buffer *b = *eb_ret;
@@ -2445,7 +2452,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- tmp = btrfs_find_tree_block(root->fs_info, blocknr);
+ tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2484,12 +2491,12 @@ read_block_for_search(struct btrfs_trans_handle *trans,
free_extent_buffer(tmp);
if (p->reada != READA_NONE)
- reada_for_search(root, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, level, slot, key->objectid);
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, 0);
+ tmp = read_tree_block(fs_info, blocknr, 0);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2521,9 +2528,11 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
struct extent_buffer *b, int level, int ins_len,
int *write_lock_level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
+
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
- BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
int sret;
if (*write_lock_level < level + 1) {
@@ -2533,7 +2542,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
}
btrfs_set_path_blocking(p);
- reada_for_balance(root, p, level);
+ reada_for_balance(fs_info, p, level);
sret = split_node(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2544,7 +2553,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
}
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
- BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
int sret;
if (*write_lock_level < level + 1) {
@@ -2554,7 +2563,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
}
btrfs_set_path_blocking(p);
- reada_for_balance(root, p, level);
+ reada_for_balance(fs_info, p, level);
sret = balance_level(trans, root, p, level);
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2663,6 +2672,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -2718,12 +2728,12 @@ again:
* so we always do read locks
*/
if (p->need_commit_sem)
- down_read(&root->fs_info->commit_root_sem);
+ down_read(&fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
if (p->need_commit_sem)
- up_read(&root->fs_info->commit_root_sem);
+ up_read(&fs_info->commit_root_sem);
if (!p->skip_locking)
btrfs_tree_read_lock(b);
} else {
@@ -2895,7 +2905,7 @@ cow_done:
} else {
p->slots[level] = slot;
if (ins_len > 0 &&
- btrfs_leaf_free_space(root, b) < ins_len) {
+ btrfs_leaf_free_space(fs_info, b) < ins_len) {
if (write_lock_level < 1) {
write_lock_level = 1;
btrfs_release_path(p);
@@ -2946,6 +2956,7 @@ done:
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *p, u64 time_seq)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -3020,7 +3031,7 @@ again:
btrfs_clear_path_blocking(p, b,
BTRFS_READ_LOCK);
}
- b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -3187,7 +3198,8 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
* error, and > 0 if there was no room in the left hand block.
*/
static int push_node_left(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *dst,
+ struct btrfs_fs_info *fs_info,
+ struct extent_buffer *dst,
struct extent_buffer *src, int empty)
{
int push_items = 0;
@@ -3197,7 +3209,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
src_nritems = btrfs_header_nritems(src);
dst_nritems = btrfs_header_nritems(dst);
- push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
WARN_ON(btrfs_header_generation(src) != trans->transid);
WARN_ON(btrfs_header_generation(dst) != trans->transid);
@@ -3222,7 +3234,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
- ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3261,7 +3273,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
* this will only push up to 1/2 the contents of the left node over
*/
static int balance_node_right(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct extent_buffer *dst,
struct extent_buffer *src)
{
@@ -3276,7 +3288,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
src_nritems = btrfs_header_nritems(src);
dst_nritems = btrfs_header_nritems(dst);
- push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+ push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
if (push_items <= 0)
return 1;
@@ -3291,13 +3303,13 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
- tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
+ tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
src_nritems - push_items, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3328,6 +3340,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
@@ -3348,9 +3361,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (IS_ERR(c))
return PTR_ERR(c);
- root_add_used(root, root->nodesize);
+ root_add_used(root, fs_info->nodesize);
- memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(c, 0, sizeof(struct btrfs_header));
btrfs_set_header_nritems(c, 1);
btrfs_set_header_level(c, level);
btrfs_set_header_bytenr(c, c->start);
@@ -3358,11 +3371,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(c, root->root_key.objectid);
- write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
- BTRFS_FSID_SIZE);
-
- write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
+ write_extent_buffer_fsid(c, fs_info->fsid);
+ write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid);
btrfs_set_node_key(c, &lower_key, 0);
btrfs_set_node_blockptr(c, 0, lower->start);
@@ -3396,7 +3406,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* blocknr is the block the key points to.
*/
static void insert_ptr(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_fs_info *fs_info, struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
@@ -3409,10 +3419,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
- BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
if (slot != nritems) {
if (level)
- tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+ tree_mod_log_eb_move(fs_info, lower, slot + 1,
slot, nritems - slot);
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
@@ -3420,7 +3430,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+ ret = tree_mod_log_insert_key(fs_info, lower, slot,
MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -3445,6 +3455,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *c;
struct extent_buffer *split;
struct btrfs_disk_key disk_key;
@@ -3472,7 +3483,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
- BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
return 0;
if (ret < 0)
return ret;
@@ -3487,22 +3498,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
if (IS_ERR(split))
return PTR_ERR(split);
- root_add_used(root, root->nodesize);
+ root_add_used(root, fs_info->nodesize);
- memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(split, 0, sizeof(struct btrfs_header));
btrfs_set_header_level(split, btrfs_header_level(c));
btrfs_set_header_bytenr(split, split->start);
btrfs_set_header_generation(split, trans->transid);
btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(split, root->root_key.objectid);
- write_extent_buffer(split, root->fs_info->fsid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE);
- write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(split),
- BTRFS_UUID_SIZE);
-
- ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
- mid, c_nritems - mid);
+ write_extent_buffer_fsid(split, fs_info->fsid);
+ write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid);
+
+ ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3518,7 +3525,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
- insert_ptr(trans, root, path, &disk_key, split->start,
+ insert_ptr(trans, fs_info, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
@@ -3566,17 +3573,19 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
* the start of the leaf data. IOW, how much room
* the leaf has left for both items and data
*/
-noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf)
{
int nritems = btrfs_header_nritems(leaf);
int ret;
- ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+
+ ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
if (ret < 0) {
- btrfs_crit(root->fs_info,
- "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
- ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
- leaf_space_used(leaf, 0, nritems), nritems);
+ btrfs_crit(fs_info,
+ "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
+ ret,
+ (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
+ leaf_space_used(leaf, 0, nritems), nritems);
}
return ret;
}
@@ -3586,7 +3595,7 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
* right. We'll push up to and including min_slot, but no lower
*/
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
@@ -3626,7 +3635,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] > i)
break;
if (path->slots[0] == i) {
- int space = btrfs_leaf_free_space(root, left);
+ int space = btrfs_leaf_free_space(fs_info, left);
if (space + push_space * 2 > free_space)
break;
}
@@ -3655,19 +3664,19 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
right_nritems = btrfs_header_nritems(right);
push_space = btrfs_item_end_nr(left, left_nritems - push_items);
- push_space -= leaf_data_end(root, left);
+ push_space -= leaf_data_end(fs_info, left);
/* make room in the right data area */
- data_end = leaf_data_end(root, right);
+ data_end = leaf_data_end(fs_info, right);
memmove_extent_buffer(right,
btrfs_leaf_data(right) + data_end - push_space,
btrfs_leaf_data(right) + data_end,
- BTRFS_LEAF_DATA_SIZE(root) - data_end);
+ BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
copy_extent_buffer(right, left, btrfs_leaf_data(right) +
- BTRFS_LEAF_DATA_SIZE(root) - push_space,
- btrfs_leaf_data(left) + leaf_data_end(root, left),
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
+ btrfs_leaf_data(left) + leaf_data_end(fs_info, left),
push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3682,7 +3691,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
/* update the item pointers */
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
- push_space = BTRFS_LEAF_DATA_SIZE(root);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
push_space -= btrfs_token_item_size(right, item, &token);
@@ -3695,7 +3704,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
- clean_tree_block(trans, root->fs_info, left);
+ clean_tree_block(trans, fs_info, left);
btrfs_mark_buffer_dirty(right);
@@ -3707,7 +3716,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root->fs_info, path->nodes[0]);
+ clean_tree_block(trans, fs_info, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3739,6 +3748,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
int min_data_size, int data_size,
int empty, u32 min_slot)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
struct extent_buffer *upper;
@@ -3757,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
- right = read_node_slot(root, upper, slot + 1);
+ right = read_node_slot(fs_info, upper, slot + 1);
/*
* slot + 1 is not valid or we fail to read the right node,
* no big deal, just return.
@@ -3768,7 +3778,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
- free_space = btrfs_leaf_free_space(root, right);
+ free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
goto out_unlock;
@@ -3778,7 +3788,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(root, right);
+ free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
goto out_unlock;
@@ -3799,7 +3809,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 0;
}
- return __push_leaf_right(trans, root, path, min_data_size, empty,
+ return __push_leaf_right(trans, fs_info, path, min_data_size, empty,
right, free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
@@ -3816,7 +3826,7 @@ out_unlock:
* items
*/
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
@@ -3849,7 +3859,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
if (path->slots[0] < i)
break;
if (path->slots[0] == i) {
- int space = btrfs_leaf_free_space(root, right);
+ int space = btrfs_leaf_free_space(fs_info, right);
if (space + push_space * 2 > free_space)
break;
}
@@ -3878,11 +3888,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_item_nr_offset(0),
push_items * sizeof(struct btrfs_item));
- push_space = BTRFS_LEAF_DATA_SIZE(root) -
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset_nr(right, push_items - 1);
copy_extent_buffer(left, right, btrfs_leaf_data(left) +
- leaf_data_end(root, left) - push_space,
+ leaf_data_end(fs_info, left) - push_space,
btrfs_leaf_data(right) +
btrfs_item_offset_nr(right, push_items - 1),
push_space);
@@ -3897,7 +3907,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
ioff = btrfs_token_item_offset(left, item, &token);
btrfs_set_token_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
&token);
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -3909,11 +3919,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
if (push_items < right_nritems) {
push_space = btrfs_item_offset_nr(right, push_items - 1) -
- leaf_data_end(root, right);
+ leaf_data_end(fs_info, right);
memmove_extent_buffer(right, btrfs_leaf_data(right) +
- BTRFS_LEAF_DATA_SIZE(root) - push_space,
+ BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
btrfs_leaf_data(right) +
- leaf_data_end(root, right), push_space);
+ leaf_data_end(fs_info, right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(push_items),
@@ -3922,7 +3932,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
}
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
- push_space = BTRFS_LEAF_DATA_SIZE(root);
+ push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
@@ -3935,10 +3945,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
- clean_tree_block(trans, root->fs_info, right);
+ clean_tree_block(trans, fs_info, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -3972,6 +3982,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int min_data_size,
int data_size, int empty, u32 max_slot)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = path->nodes[0];
struct extent_buffer *left;
int slot;
@@ -3991,7 +4002,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
- left = read_node_slot(root, path->nodes[1], slot - 1);
+ left = read_node_slot(fs_info, path->nodes[1], slot - 1);
/*
* slot - 1 is not valid or we fail to read the left node,
* no big deal, just return.
@@ -4002,7 +4013,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
- free_space = btrfs_leaf_free_space(root, left);
+ free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
ret = 1;
goto out;
@@ -4018,13 +4029,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(root, left);
+ free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
ret = 1;
goto out;
}
- return __push_leaf_left(trans, root, path, min_data_size,
+ return __push_leaf_left(trans, fs_info, path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
out:
@@ -4038,7 +4049,7 @@ out:
* available for the resulting leaf level of the path.
*/
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *l,
struct extent_buffer *right,
@@ -4054,19 +4065,18 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
nritems * sizeof(struct btrfs_item));
copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) -
data_copy_size, btrfs_leaf_data(l) +
- leaf_data_end(root, l), data_copy_size);
+ leaf_data_end(fs_info, l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
- btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
for (i = 0; i < nritems; i++) {
struct btrfs_item *item = btrfs_item_nr(i);
@@ -4079,7 +4089,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
- insert_ptr(trans, root, path, &disk_key, right->start,
+ insert_ptr(trans, fs_info, path, &disk_key, right->start,
path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
@@ -4115,6 +4125,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
int data_size)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int progress = 0;
int slot;
@@ -4123,7 +4134,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
slot = path->slots[0];
if (slot < btrfs_header_nritems(path->nodes[0]))
- space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
+ space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
/*
* try to push all the items after our slot into the
@@ -4144,7 +4155,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
if (path->slots[0] == 0 || path->slots[0] == nritems)
return 0;
- if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
return 0;
/* try to push all the items before our slot into the next leaf */
@@ -4189,7 +4200,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
if (extend && data_size + btrfs_item_size_nr(l, slot) +
- sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
@@ -4197,7 +4208,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int space_needed = data_size;
if (slot < btrfs_header_nritems(l))
- space_needed -= btrfs_leaf_free_space(root, l);
+ space_needed -= btrfs_leaf_free_space(fs_info, l);
wret = push_leaf_right(trans, root, path, space_needed,
space_needed, 0, 0);
@@ -4212,7 +4223,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
/* did the pushes work? */
- if (btrfs_leaf_free_space(root, l) >= data_size)
+ if (btrfs_leaf_free_space(fs_info, l) >= data_size)
return 0;
}
@@ -4231,14 +4242,14 @@ again:
if (mid <= slot) {
if (nritems == 1 ||
leaf_space_used(l, mid, nritems - mid) + data_size >
- BTRFS_LEAF_DATA_SIZE(root)) {
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (slot >= nritems) {
split = 0;
} else {
mid = slot;
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
- data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
split = 2;
@@ -4247,7 +4258,7 @@ again:
}
} else {
if (leaf_space_used(l, 0, mid) + data_size >
- BTRFS_LEAF_DATA_SIZE(root)) {
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (!extend && data_size && slot == 0) {
split = 0;
} else if ((extend || !data_size) && slot == 0) {
@@ -4256,7 +4267,7 @@ again:
mid = slot;
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
- data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+ data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
split = 2;
@@ -4275,26 +4286,22 @@ again:
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, root->nodesize);
+ root_add_used(root, fs_info->nodesize);
- memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(right, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
btrfs_set_header_generation(right, trans->transid);
btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(right, root->root_key.objectid);
btrfs_set_header_level(right, 0);
- write_extent_buffer(right, fs_info->fsid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE);
-
- write_extent_buffer(right, fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(right),
- BTRFS_UUID_SIZE);
+ write_extent_buffer_fsid(right, fs_info->fsid);
+ write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid);
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
+ insert_ptr(trans, fs_info, path, &disk_key,
+ right->start, path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -4302,8 +4309,8 @@ again:
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
- insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1], 1);
+ insert_ptr(trans, fs_info, path, &disk_key,
+ right->start, path->slots[1], 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -4319,7 +4326,7 @@ again:
return ret;
}
- copy_for_split(trans, root, path, l, right, slot, mid, nritems);
+ copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
@@ -4332,7 +4339,7 @@ again:
push_for_double:
push_for_double_split(trans, root, path, data_size);
tried_avoid_double = 1;
- if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+ if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
return 0;
goto again;
}
@@ -4341,6 +4348,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int ins_len)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
@@ -4354,7 +4362,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
key.type != BTRFS_EXTENT_CSUM_KEY);
- if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+ if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
return 0;
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -4381,7 +4389,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
/* the leaf has changed, it now has room. return now */
- if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+ if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
goto err;
if (key.type == BTRFS_EXTENT_DATA_KEY) {
@@ -4405,7 +4413,7 @@ err:
}
static noinline int split_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *new_key,
unsigned long split_offset)
@@ -4421,7 +4429,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+ BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
btrfs_set_path_blocking(path);
@@ -4470,7 +4478,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
item_size - split_offset);
btrfs_mark_buffer_dirty(leaf);
- BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+ BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
kfree(buf);
return 0;
}
@@ -4502,7 +4510,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(trans, root, path, new_key, split_offset);
+ ret = split_item(trans, root->fs_info, path, new_key, split_offset);
return ret;
}
@@ -4548,8 +4556,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
- u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4572,7 +4580,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
return;
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
+ data_end = leaf_data_end(fs_info, leaf);
old_data_start = btrfs_item_offset_nr(leaf, slot);
@@ -4631,15 +4639,15 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
+ if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ btrfs_print_leaf(fs_info, leaf);
BUG();
}
}
@@ -4647,7 +4655,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
u32 data_size)
{
int slot;
@@ -4665,10 +4673,10 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
+ data_end = leaf_data_end(fs_info, leaf);
- if (btrfs_leaf_free_space(root, leaf) < data_size) {
- btrfs_print_leaf(root, leaf);
+ if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
+ btrfs_print_leaf(fs_info, leaf);
BUG();
}
slot = path->slots[0];
@@ -4676,9 +4684,9 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
BUG_ON(slot < 0);
if (slot >= nritems) {
- btrfs_print_leaf(root, leaf);
- btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
- slot, nritems);
+ btrfs_print_leaf(fs_info, leaf);
+ btrfs_crit(fs_info, "slot %d too large, nritems %d",
+ slot, nritems);
BUG_ON(1);
}
@@ -4706,8 +4714,8 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
+ if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ btrfs_print_leaf(fs_info, leaf);
BUG();
}
}
@@ -4721,6 +4729,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *cpu_key, u32 *data_size,
u32 total_data, u32 total_size, int nr)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
int i;
u32 nritems;
@@ -4732,7 +4741,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4742,13 +4751,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
- data_end = leaf_data_end(root, leaf);
+ data_end = leaf_data_end(fs_info, leaf);
- if (btrfs_leaf_free_space(root, leaf) < total_size) {
- btrfs_print_leaf(root, leaf);
- btrfs_crit(root->fs_info,
- "not enough freespace need %u have %d",
- total_size, btrfs_leaf_free_space(root, leaf));
+ if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
+ btrfs_print_leaf(fs_info, leaf);
+ btrfs_crit(fs_info, "not enough freespace need %u have %d",
+ total_size, btrfs_leaf_free_space(fs_info, leaf));
BUG();
}
@@ -4756,9 +4764,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
if (old_data < data_end) {
- btrfs_print_leaf(root, leaf);
- btrfs_crit(root->fs_info,
- "slot %d old_data %d data_end %d",
+ btrfs_print_leaf(fs_info, leaf);
+ btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
slot, old_data, data_end);
BUG_ON(1);
}
@@ -4800,8 +4807,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
- if (btrfs_leaf_free_space(root, leaf) < 0) {
- btrfs_print_leaf(root, leaf);
+ if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
+ btrfs_print_leaf(fs_info, leaf);
BUG();
}
}
@@ -4876,6 +4883,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
int ret;
@@ -4883,7 +4891,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level)
- tree_mod_log_eb_move(root->fs_info, parent, slot,
+ tree_mod_log_eb_move(fs_info, parent, slot,
slot + 1, nritems - slot - 1);
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
@@ -4891,7 +4899,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ ret = tree_mod_log_insert_key(fs_info, parent, slot,
MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -4906,7 +4914,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
+ fixup_low_keys(fs_info, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
}
@@ -4948,6 +4956,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 last_off;
@@ -4969,7 +4978,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(root, leaf);
+ int data_end = leaf_data_end(fs_info, leaf);
memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
data_end + dsize,
@@ -4999,7 +5008,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_level(leaf, 0);
} else {
btrfs_set_path_blocking(path);
- clean_tree_block(trans, root->fs_info, leaf);
+ clean_tree_block(trans, fs_info, leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
@@ -5008,11 +5017,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(root->fs_info, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -5132,6 +5141,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path,
u64 min_trans)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
struct btrfs_key found_key;
int slot;
@@ -5208,7 +5218,7 @@ find_next_key:
goto out;
}
btrfs_set_path_blocking(path);
- cur = read_node_slot(root, cur, slot);
+ cur = read_node_slot(fs_info, cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
goto out;
@@ -5231,14 +5241,14 @@ out:
return ret;
}
-static int tree_move_down(struct btrfs_root *root,
+static int tree_move_down(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int *level, int root_level)
{
struct extent_buffer *eb;
BUG_ON(*level == 0);
- eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
+ eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
if (IS_ERR(eb))
return PTR_ERR(eb);
@@ -5248,7 +5258,7 @@ static int tree_move_down(struct btrfs_root *root,
return 0;
}
-static int tree_move_next_or_upnext(struct btrfs_root *root,
+static int tree_move_next_or_upnext(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int *level, int root_level)
{
@@ -5279,7 +5289,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
* Returns 1 if it had to move up and next. 0 is returned if it moved only next
* or down.
*/
-static int tree_advance(struct btrfs_root *root,
+static int tree_advance(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int *level, int root_level,
int allow_down,
@@ -5288,9 +5298,10 @@ static int tree_advance(struct btrfs_root *root,
int ret;
if (*level == 0 || !allow_down) {
- ret = tree_move_next_or_upnext(root, path, level, root_level);
+ ret = tree_move_next_or_upnext(fs_info, path, level,
+ root_level);
} else {
- ret = tree_move_down(root, path, level, root_level);
+ ret = tree_move_down(fs_info, path, level, root_level);
}
if (ret >= 0) {
if (*level == 0)
@@ -5303,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root,
return ret;
}
-static int tree_compare_item(struct btrfs_root *left_root,
- struct btrfs_path *left_path,
+static int tree_compare_item(struct btrfs_path *left_path,
struct btrfs_path *right_path,
char *tmp_buf)
{
@@ -5349,6 +5359,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
struct btrfs_root *right_root,
btrfs_changed_cb_t changed_cb, void *ctx)
{
+ struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
int cmp;
struct btrfs_path *left_path = NULL;
@@ -5380,9 +5391,9 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!tmp_buf) {
- tmp_buf = vmalloc(left_root->nodesize);
+ tmp_buf = vmalloc(fs_info->nodesize);
if (!tmp_buf) {
ret = -ENOMEM;
goto out;
@@ -5430,7 +5441,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
* the right if possible or go up and right.
*/
- down_read(&left_root->fs_info->commit_root_sem);
+ down_read(&fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
left_path->nodes[left_level] = left_root->commit_root;
@@ -5440,7 +5451,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level = right_level;
right_path->nodes[right_level] = right_root->commit_root;
extent_buffer_get(right_path->nodes[right_level]);
- up_read(&left_root->fs_info->commit_root_sem);
+ up_read(&fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5460,7 +5471,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
while (1) {
if (advance_left && !left_end_reached) {
- ret = tree_advance(left_root, left_path, &left_level,
+ ret = tree_advance(fs_info, left_path, &left_level,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
&left_key);
@@ -5471,7 +5482,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = 0;
}
if (advance_right && !right_end_reached) {
- ret = tree_advance(right_root, right_path, &right_level,
+ ret = tree_advance(fs_info, right_path, &right_level,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
&right_key);
@@ -5535,8 +5546,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
enum btrfs_compare_tree_result result;
WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
- ret = tree_compare_item(left_root, left_path,
- right_path, tmp_buf);
+ ret = tree_compare_item(left_path, right_path,
+ tmp_buf);
if (ret)
result = BTRFS_COMPARE_TREE_CHANGED;
else
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0b8ce2b9f7d0..6a823719b6c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -90,9 +90,6 @@ static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
-/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
-#define REQ_GET_READ_MIRRORS (1 << 30)
-
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
@@ -340,7 +337,7 @@ struct btrfs_path {
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
@@ -429,6 +426,10 @@ struct btrfs_space_info {
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+ /*
+ * tickets_id just indicates the next ticket will be handled, so note
+ * it's not stored per ticket.
+ */
u64 tickets_id;
struct rw_semaphore groups_sem;
@@ -518,7 +519,7 @@ struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
struct page **pages;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
struct inode *inode;
unsigned long size;
int index;
@@ -798,7 +799,6 @@ struct btrfs_fs_info {
spinlock_t super_lock;
struct btrfs_super_block *super_copy;
struct btrfs_super_block *super_for_commit;
- struct block_device *__bdev;
struct super_block *sb;
struct inode *btree_inode;
struct backing_dev_info bdi;
@@ -1084,8 +1084,18 @@ struct btrfs_fs_info {
/* Used to record internally whether fs has been frozen */
int fs_frozen;
+
+ /* Cached block sizes */
+ u32 nodesize;
+ u32 sectorsize;
+ u32 stripesize;
};
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
struct btrfs_subvolume_writers {
struct percpu_counter counter;
wait_queue_head_t wait;
@@ -1159,14 +1169,6 @@ struct btrfs_root {
u64 objectid;
u64 last_trans;
- /* data allocations are done in sectorsize units */
- u32 sectorsize;
-
- /* node allocations are done in nodesize units */
- u32 nodesize;
-
- u32 stripesize;
-
u32 type;
u64 highest_objectid;
@@ -1250,38 +1252,42 @@ struct btrfs_root {
/* For qgroup metadata space reserve */
atomic_t qgroup_meta_rsv;
};
+static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
+{
+ return btrfs_sb(inode->i_sb)->sectorsize;
+}
static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
{
return blocksize - sizeof(struct btrfs_header);
}
-static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
+static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
- return __BTRFS_LEAF_DATA_SIZE(root->nodesize);
+ return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
}
-static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
+static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
- return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
}
-static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
+static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
{
- return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr);
+ return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr);
}
#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
(offsetof(struct btrfs_file_extent_item, disk_bytenr))
-static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
{
- return BTRFS_MAX_ITEM_SIZE(root) -
+ return BTRFS_MAX_ITEM_SIZE(info) -
BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
-static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
+static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
{
- return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item);
+ return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
/*
@@ -1343,12 +1349,13 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
#ifdef CONFIG_BTRFS_DEBUG
static inline int
-btrfs_should_fragment_free_space(struct btrfs_root *root,
- struct btrfs_block_group_cache *block_group)
+btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group)
{
- return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+
+ return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
+ (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif
@@ -2210,6 +2217,8 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
cpu->target = le64_to_cpu(disk->target);
cpu->flags = le64_to_cpu(disk->flags);
cpu->limit = le64_to_cpu(disk->limit);
+ cpu->stripes_min = le32_to_cpu(disk->stripes_min);
+ cpu->stripes_max = le32_to_cpu(disk->stripes_max);
}
static inline void
@@ -2228,6 +2237,8 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
disk->target = cpu_to_le64(cpu->target);
disk->flags = cpu_to_le64(cpu->flags);
disk->limit = cpu_to_le64(cpu->limit);
+ disk->stripes_min = cpu_to_le32(cpu->stripes_min);
+ disk->stripes_max = cpu_to_le32(cpu->stripes_max);
}
/* struct btrfs_super_block */
@@ -2299,13 +2310,13 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
* this returns the address of the start of the last item,
* which is the stop of the leaf data stack
*/
-static inline unsigned int leaf_data_end(struct btrfs_root *root,
+static inline unsigned int leaf_data_end(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf)
{
u32 nr = btrfs_header_nritems(leaf);
if (nr == 0)
- return BTRFS_LEAF_DATA_SIZE(root);
+ return BTRFS_LEAF_DATA_SIZE(fs_info);
return btrfs_item_offset_nr(leaf, nr - 1);
}
@@ -2501,11 +2512,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
struct btrfs_dev_replace_item, cursor_right, 64);
-static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_leaf_data(leaf) + \
@@ -2528,28 +2534,28 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
/* extent-tree.c */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
-static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
- return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+ return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/*
* Doing a truncate won't result in new nodes or leaves, just what we need for
* COW.
*/
-static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
- return root->nodesize * BTRFS_MAX_LEVEL * num_items;
+ return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2558,18 +2564,18 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long count);
-int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
-int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_root *root,
+int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes);
-int btrfs_exclude_logged_extents(struct btrfs_root *root,
+int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2590,12 +2596,11 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
u64 root_objectid, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
@@ -2606,52 +2611,52 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 flags,
int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
- int delalloc);
-int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+ struct btrfs_fs_info *fs_info);
+int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
-int btrfs_read_block_groups(struct btrfs_root *root);
-int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
+int btrfs_read_block_groups(struct btrfs_fs_info *info);
+int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytes_used,
+ struct btrfs_fs_info *fs_info, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start,
+ struct btrfs_fs_info *fs_info, u64 group_start,
struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -2681,7 +2686,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
@@ -2690,7 +2695,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems,
u64 *qgroup_reserved, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
@@ -2698,16 +2703,15 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
-void btrfs_free_block_rsv(struct btrfs_root *root,
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
@@ -2717,22 +2721,21 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
-void btrfs_block_rsv_release(struct btrfs_root *root,
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-void btrfs_dec_block_group_ro(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache);
+void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
-int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
-int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type);
-int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+ struct btrfs_fs_info *fs_info, u64 type);
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
@@ -2742,8 +2745,7 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const u64 type);
+ struct btrfs_fs_info *fs_info, const u64 type);
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *info, u64 start, u64 end);
@@ -2793,10 +2795,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct extent_buffer **cow_ret, u64 new_root_objectid);
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf);
-void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
u32 data_size);
-void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
- u32 new_size, int from_end);
+void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2872,7 +2874,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
}
-int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
int update_ref, int for_reloc);
@@ -2898,10 +2901,9 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* anything except sleeping. This function is used to check the status of
* the fs.
*/
-static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
- return (root->fs_info->sb->s_flags & MS_RDONLY ||
- btrfs_fs_closing(root->fs_info));
+ return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info);
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
@@ -2931,11 +2933,11 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len);
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len);
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2950,7 +2952,7 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
-int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
@@ -2959,10 +2961,10 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
/* uuid-tree.c */
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@ -3004,10 +3006,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-int verify_dir_item(struct btrfs_root *root,
+int verify_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
@@ -3051,11 +3053,10 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr, u64 len);
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u32 *dst);
-int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 logical_offset);
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
+ u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -3069,8 +3070,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 file_start, int contig);
+int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+ u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct inode *inode,
@@ -3173,7 +3174,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
-void btrfs_run_delayed_iputs(struct btrfs_root *root);
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@@ -3227,14 +3228,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
- struct page **pages, size_t num_pages,
- loff_t pos, size_t write_bytes,
+int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags);
int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len);
@@ -3252,7 +3249,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
-int btrfs_parse_options(struct btrfs_root *root, char *options,
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
@@ -3445,9 +3442,14 @@ do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- WARN(1, KERN_DEBUG \
- "BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
+ if ((errno) != -EIO) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } else { \
+ pr_debug("BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
} \
__btrfs_abort_transaction((trans), __func__, \
__LINE__, (errno)); \
@@ -3609,7 +3611,7 @@ static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
#endif
/* relocation.c */
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
@@ -3628,12 +3630,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
-void btrfs_scrub_pause(struct btrfs_root *root);
-void btrfs_scrub_continue(struct btrfs_root *root);
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
-int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
/* dev-replace.c */
@@ -3648,7 +3650,7 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
/* reada.c */
struct reada_control {
- struct btrfs_root *root; /* tree to prefetch */
+ struct btrfs_fs_info *fs_info; /* tree to prefetch */
struct btrfs_key key_start;
struct btrfs_key key_end; /* exclusive */
atomic_t elems;
@@ -3660,7 +3662,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, u64 start, int err);
+ struct extent_buffer *eb, int err);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0fcf5f25d524..80982a83c9fd 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -72,12 +72,6 @@ static inline int btrfs_is_continuous_delayed_item(
return 0;
}
-static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
- struct btrfs_root *root)
-{
- return root->fs_info->delayed_root;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
{
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
@@ -535,7 +529,7 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
@@ -547,12 +541,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return 0;
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->delayed_block_rsv;
+ dst_rsv = &fs_info->delayed_block_rsv;
- num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
if (!ret) {
- trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
num_bytes, 1);
item->bytes_reserved = num_bytes;
@@ -561,7 +555,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *rsv;
@@ -569,11 +563,11 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
if (!item->bytes_reserved)
return;
- rsv = &root->fs_info->delayed_block_rsv;
- trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ rsv = &fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
- btrfs_block_rsv_release(root, rsv,
+ btrfs_block_rsv_release(fs_info, rsv,
item->bytes_reserved);
}
@@ -583,6 +577,7 @@ static int btrfs_delayed_inode_reserve_metadata(
struct inode *inode,
struct btrfs_delayed_node *node)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
@@ -590,9 +585,9 @@ static int btrfs_delayed_inode_reserve_metadata(
bool release = false;
src_rsv = trans->block_rsv;
- dst_rsv = &root->fs_info->delayed_block_rsv;
+ dst_rsv = &fs_info->delayed_block_rsv;
- num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
/*
* If our block_rsv is the delalloc block reserve then check and see if
@@ -640,7 +635,7 @@ static int btrfs_delayed_inode_reserve_metadata(
ret = -ENOSPC;
if (!ret) {
node->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(root->fs_info,
+ trace_btrfs_space_reservation(fs_info,
"delayed_inode",
btrfs_ino(inode),
num_bytes, 1);
@@ -664,21 +659,21 @@ static int btrfs_delayed_inode_reserve_metadata(
* how block rsvs. work.
*/
if (!ret) {
- trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
node->bytes_reserved = num_bytes;
}
if (release) {
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 0);
- btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
}
return ret;
}
-static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node)
{
struct btrfs_block_rsv *rsv;
@@ -686,10 +681,10 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
if (!node->bytes_reserved)
return;
- rsv = &root->fs_info->delayed_block_rsv;
- trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ rsv = &fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(fs_info, "delayed_inode",
node->inode_id, node->bytes_reserved, 0);
- btrfs_block_rsv_release(root, rsv,
+ btrfs_block_rsv_release(fs_info, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
}
@@ -702,6 +697,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
int free_space;
int total_data_size = 0, total_size = 0;
@@ -718,7 +714,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
BUG_ON(!path->nodes[0]);
leaf = path->nodes[0];
- free_space = btrfs_leaf_free_space(root, leaf);
+ free_space = btrfs_leaf_free_space(fs_info, leaf);
INIT_LIST_HEAD(&head);
next = item;
@@ -791,7 +787,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
curr->data_len);
slot++;
- btrfs_delayed_item_release_metadata(root, curr);
+ btrfs_delayed_item_release_metadata(fs_info, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
@@ -813,6 +809,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *delayed_item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
char *ptr;
int ret;
@@ -830,7 +827,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
delayed_item->data_len);
btrfs_mark_buffer_dirty(leaf);
- btrfs_delayed_item_release_metadata(root, delayed_item);
+ btrfs_delayed_item_release_metadata(fs_info, delayed_item);
return 0;
}
@@ -882,6 +879,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -931,7 +929,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
goto out;
list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+ btrfs_delayed_item_release_metadata(fs_info, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
@@ -1017,6 +1015,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_node *node)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
@@ -1073,7 +1072,7 @@ out:
no_iref:
btrfs_release_path(path);
err_out:
- btrfs_delayed_inode_release_metadata(root, node);
+ btrfs_delayed_inode_release_metadata(fs_info, node);
btrfs_release_delayed_inode(node);
return ret;
@@ -1138,7 +1137,7 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
* outstanding delayed items cleaned up.
*/
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int nr)
+ struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
@@ -1156,9 +1155,9 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->delayed_block_rsv;
+ trans->block_rsv = &fs_info->delayed_block_rsv;
- delayed_root = btrfs_get_delayed_root(root);
+ delayed_root = fs_info->delayed_root;
curr_node = btrfs_first_delayed_node(delayed_root);
while (curr_node && (!count || (count && nr--))) {
@@ -1185,15 +1184,15 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
}
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- return __btrfs_run_delayed_items(trans, root, -1);
+ return __btrfs_run_delayed_items(trans, fs_info, -1);
}
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int nr)
+ struct btrfs_fs_info *fs_info, int nr)
{
- return __btrfs_run_delayed_items(trans, root, nr);
+ return __btrfs_run_delayed_items(trans, fs_info, nr);
}
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
@@ -1236,6 +1235,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
int btrfs_commit_inode_delayed_inode(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
struct btrfs_path *path;
@@ -1267,7 +1267,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
- trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+ trans->block_rsv = &fs_info->delayed_block_rsv;
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
@@ -1280,8 +1280,8 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
btrfs_free_path(path);
trans->block_rsv = block_rsv;
trans_out:
- btrfs_end_transaction(trans, delayed_node->root);
- btrfs_btree_balance_dirty(delayed_node->root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
out:
btrfs_release_delayed_node(delayed_node);
@@ -1345,15 +1345,16 @@ again:
__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
trans->block_rsv = block_rsv;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty_nodelay(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty_nodelay(root->fs_info);
release_path:
btrfs_release_path(path);
total_done++;
btrfs_release_prepared_delayed_node(delayed_node);
- if (async_work->nr == 0 || total_done < async_work->nr)
+ if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
+ total_done < async_work->nr)
goto again;
free_path:
@@ -1369,7 +1370,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
{
struct btrfs_async_delayed_work *async_work;
- if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
+ btrfs_workqueue_normal_congested(fs_info->delayed_workers))
return 0;
async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
@@ -1385,11 +1387,9 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return 0;
}
-void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
+void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_root *delayed_root;
- delayed_root = btrfs_get_delayed_root(root);
- WARN_ON(btrfs_first_delayed_node(delayed_root));
+ WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
}
static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
@@ -1405,12 +1405,9 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
return 0;
}
-void btrfs_balance_delayed_items(struct btrfs_root *root)
+void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_root *delayed_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
-
- delayed_root = btrfs_get_delayed_root(root);
+ struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
return;
@@ -1435,8 +1432,9 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
/* Will return 0 or -ENOMEM */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
- int name_len, struct inode *dir,
+ struct btrfs_fs_info *fs_info,
+ const char *name, int name_len,
+ struct inode *dir,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
@@ -1467,7 +1465,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible
@@ -1478,7 +1476,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
name_len, name, delayed_node->root->objectid,
delayed_node->inode_id, ret);
@@ -1491,7 +1489,7 @@ release_node:
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
struct btrfs_key *key)
{
@@ -1504,15 +1502,15 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
return 1;
}
- btrfs_delayed_item_release_metadata(root, item);
+ btrfs_delayed_item_release_metadata(fs_info, item);
btrfs_release_delayed_item(item);
mutex_unlock(&node->mutex);
return 0;
}
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *dir,
- u64 index)
+ struct btrfs_fs_info *fs_info,
+ struct inode *dir, u64 index)
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
@@ -1527,7 +1525,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
item_key.type = BTRFS_DIR_INDEX_KEY;
item_key.offset = index;
- ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
+ ret = btrfs_delete_delayed_insertion_item(fs_info, node, &item_key);
if (!ret)
goto end;
@@ -1539,7 +1537,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
item->key = item_key;
- ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1549,7 +1547,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&node->mutex);
ret = __btrfs_add_delayed_deletion_item(node, item);
if (unlikely(ret)) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
index, node->root->objectid, node->inode_id, ret);
BUG();
@@ -1686,7 +1684,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
*
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list, bool *emitted)
+ struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1730,7 +1728,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
if (over)
return 1;
- *emitted = true;
}
return 0;
}
@@ -1861,6 +1858,7 @@ release_node:
int btrfs_delayed_delete_inode_ref(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_delayed_node *delayed_node;
/*
@@ -1868,8 +1866,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
* leads to enospc problems. This means we also can't do
* delayed inode refs
*/
- if (test_bit(BTRFS_FS_LOG_RECOVERING,
- &BTRFS_I(inode)->root->fs_info->flags))
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return -EAGAIN;
delayed_node = btrfs_get_or_create_delayed_node(inode);
@@ -1896,7 +1893,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
delayed_node->count++;
- atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
+ atomic_inc(&fs_info->delayed_root->items);
release_node:
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -1906,12 +1903,13 @@ release_node:
static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
struct btrfs_root *root = delayed_node->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr_item, *prev_item;
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
+ btrfs_delayed_item_release_metadata(fs_info, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1919,7 +1917,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
+ btrfs_delayed_item_release_metadata(fs_info, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1929,7 +1927,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- btrfs_delayed_inode_release_metadata(root, delayed_node);
+ btrfs_delayed_inode_release_metadata(fs_info, delayed_node);
btrfs_release_delayed_inode(delayed_node);
}
mutex_unlock(&delayed_node->mutex);
@@ -1976,14 +1974,11 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
}
-void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
- struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
- delayed_root = btrfs_get_delayed_root(root);
-
- curr_node = btrfs_first_delayed_node(delayed_root);
+ curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
while (curr_node) {
__btrfs_kill_delayed_node(curr_node);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 2495b3d4075f..8a2bf5e3e4cf 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -99,23 +99,24 @@ static inline void btrfs_init_delayed_root(
}
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
- int name_len, struct inode *dir,
+ struct btrfs_fs_info *fs_info,
+ const char *name, int name_len,
+ struct inode *dir,
struct btrfs_disk_key *disk_key, u8 type,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *dir,
- u64 index);
+ struct btrfs_fs_info *fs_info,
+ struct inode *dir, u64 index);
int btrfs_inode_delayed_dir_index_count(struct inode *inode);
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int nr);
+ struct btrfs_fs_info *fs_info, int nr);
-void btrfs_balance_delayed_items(struct btrfs_root *root);
+void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct inode *inode);
@@ -134,7 +135,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode);
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
/* Used for clean the transaction */
-void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
bool btrfs_readdir_get_delayed_items(struct inode *inode,
@@ -146,13 +147,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list, bool *emitted);
+ struct list_head *ins_list);
/* for init */
int __init btrfs_delayed_inode_init(void);
void btrfs_delayed_inode_exit(void);
/* for debugging */
-void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
+void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8d93854a4b4f..ef724a5fc30e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -189,6 +189,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
} else {
assert_spin_locked(&head->lock);
list_del(&ref->list);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
}
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
@@ -431,6 +433,15 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
exist->action = ref->action;
mod = -exist->ref_mod;
exist->ref_mod = ref->ref_mod;
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ list_add_tail(&exist->add_list,
+ &href->ref_add_list);
+ else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+ ASSERT(!list_empty(&exist->add_list));
+ list_del(&exist->add_list);
+ } else {
+ ASSERT(0);
+ }
} else
mod = -ref->ref_mod;
}
@@ -444,6 +455,8 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
add_tail:
list_add_tail(&ref->list, &href->ref_list);
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
+ list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
trans->delayed_ref_updates++;
spin_unlock(&href->lock);
@@ -590,6 +603,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
INIT_LIST_HEAD(&head_ref->ref_list);
+ INIT_LIST_HEAD(&head_ref->ref_add_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
head_ref->qgroup_reserved = 0;
@@ -606,7 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
- if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
+ if(btrfs_qgroup_trace_extent_nolock(fs_info,
delayed_refs, qrecord))
kfree(qrecord);
}
@@ -671,6 +685,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
+ INIT_LIST_HEAD(&ref->list);
+ INIT_LIST_HEAD(&ref->add_list);
full_ref = btrfs_delayed_node_to_tree_ref(ref);
full_ref->parent = parent;
@@ -726,6 +742,8 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
+ INIT_LIST_HEAD(&ref->list);
+ INIT_LIST_HEAD(&ref->add_list);
full_ref = btrfs_delayed_node_to_data_ref(ref);
full_ref->parent = parent;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 43f3629760e9..50947b5a9152 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -34,14 +34,14 @@
* ref_head. Must clean this mess up later.
*/
struct btrfs_delayed_ref_node {
- /*
- * ref_head use rb tree, stored in ref_root->href.
- * indexed by bytenr
- */
- struct rb_node rb_node;
-
/*data/tree ref use list, stored in ref_head->ref_list. */
struct list_head list;
+ /*
+ * If action is BTRFS_ADD_DELAYED_REF, also link this node to
+ * ref_head->ref_add_list, then we do not need to iterate the
+ * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+ */
+ struct list_head add_list;
/* the starting bytenr of the extent */
u64 bytenr;
@@ -99,6 +99,8 @@ struct btrfs_delayed_ref_head {
spinlock_t lock;
struct list_head ref_list;
+ /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
+ struct list_head ref_add_list;
struct rb_node href_node;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 05169ef30596..5de280b9ad73 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found:
* missing
*/
if (!dev_replace->srcdev &&
- !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
+ !btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found:
src_devid);
}
if (!dev_replace->tgtdev &&
- !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
+ !btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -304,11 +304,11 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
-int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name,
u64 srcdevid, char *srcdev_name, int read_src)
{
+ struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret;
struct btrfs_device *tgt_device = NULL;
@@ -316,14 +316,14 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
- ret = btrfs_find_device_by_devspec(root, srcdevid,
+ ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name, &src_device);
if (ret) {
mutex_unlock(&fs_info->volume_mutex);
return ret;
}
- ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
+ ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
mutex_unlock(&fs_info->volume_mutex);
if (ret)
@@ -335,7 +335,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
*/
trans = btrfs_attach_transaction(root);
if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
} else if (PTR_ERR(trans) != -ENOENT) {
@@ -387,7 +387,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -397,7 +397,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
goto leave;
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
/* the disk copy procedure reuses the scrub code */
@@ -422,7 +422,7 @@ leave:
return ret;
}
-int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
int ret;
@@ -439,7 +439,7 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
+ ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
args->start.srcdev_name,
args->start.cont_reading_from_srcdev_mode);
@@ -501,25 +501,25 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
mutex_lock(&uuid_mutex);
/* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace, 1);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -535,15 +535,15 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- btrfs_err_in_rcu(root->fs_info,
- "btrfs_scrub_dev(%s, %llu, %s) failed %d",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
- src_device->devid,
- rcu_str_deref(tgt_device->name), scrub_ret);
+ btrfs_err_in_rcu(fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace, 1);
- mutex_unlock(&root->fs_info->chunk_mutex);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -552,12 +552,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
return scrub_ret;
}
- btrfs_info_in_rcu(root->fs_info,
- "dev_replace from %s (devid %llu) to %s finished",
- src_device->missing ? "<missing disk>" :
- rcu_str_deref(src_device->name),
- src_device->devid,
- rcu_str_deref(tgt_device->name));
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
+ src_device->missing ? "<missing disk>" :
+ rcu_str_deref(src_device->name),
+ src_device->devid,
+ rcu_str_deref(tgt_device->name));
tgt_device->is_tgtdev_for_dev_replace = 0;
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
@@ -592,8 +592,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
- mutex_unlock(&root->fs_info->chunk_mutex);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
@@ -603,7 +603,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
if (!IS_ERR(trans))
- btrfs_commit_transaction(trans, root);
+ btrfs_commit_transaction(trans);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -718,7 +718,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index e922b42d91df..54ea12bda15b 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,9 +25,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name,
u64 srcdevid, char *srcdev_name, int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0dc1a033275e..b039fe0c751a 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -38,6 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
const char *name,
int name_len)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
struct btrfs_item *item;
@@ -46,10 +47,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (ret == -EEXIST) {
struct btrfs_dir_item *di;
- di = btrfs_match_dir_item_name(root, path, name, name_len);
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(root, path, data_size);
+ btrfs_extend_item(fs_info, path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -79,7 +80,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 data_size;
- BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+ BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info));
key.objectid = objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -172,8 +173,9 @@ second_insert:
}
btrfs_release_path(path);
- ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
- &disk_key, type, index);
+ ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name,
+ name_len, dir, &disk_key, type,
+ index);
out_free:
btrfs_free_path(path);
if (ret)
@@ -210,7 +212,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
if (ret > 0)
return NULL;
- return btrfs_match_dir_item_name(root, path, name, name_len);
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -246,7 +248,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root, path, name, name_len);
+ di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -261,7 +263,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size_nr(leaf, slot) +
- sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
/* plenty of insertion room */
@@ -301,7 +303,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
if (ret > 0)
return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root, path, name, name_len);
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}
struct btrfs_dir_item *
@@ -342,7 +344,8 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
- di = btrfs_match_dir_item_name(root, path, name, name_len);
+ di = btrfs_match_dir_item_name(root->fs_info, path,
+ name, name_len);
if (di)
return di;
@@ -371,7 +374,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
if (ret > 0)
return NULL;
- return btrfs_match_dir_item_name(root, path, name, name_len);
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}
/*
@@ -379,7 +382,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name, int name_len)
{
@@ -392,7 +395,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- if (verify_dir_item(root, leaf, dir_item))
+ if (verify_dir_item(fs_info, leaf, dir_item))
return NULL;
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -442,12 +445,13 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
+ btrfs_truncate_item(root->fs_info, path,
+ item_len - sub_item_len, 1);
}
return ret;
}
-int verify_dir_item(struct btrfs_root *root,
+int verify_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item)
{
@@ -455,8 +459,7 @@ int verify_dir_item(struct btrfs_root *root,
u8 type = btrfs_dir_type(leaf, dir_item);
if (type >= BTRFS_FT_MAX) {
- btrfs_crit(root->fs_info, "invalid dir item type: %d",
- (int)type);
+ btrfs_crit(fs_info, "invalid dir item type: %d", (int)type);
return 1;
}
@@ -464,16 +467,16 @@ int verify_dir_item(struct btrfs_root *root,
namelen = XATTR_NAME_MAX;
if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
- btrfs_crit(root->fs_info, "invalid dir item name len: %u",
+ btrfs_crit(fs_info, "invalid dir item name len: %u",
(unsigned)btrfs_dir_data_len(leaf, dir_item));
return 1;
}
/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
if ((btrfs_dir_data_len(leaf, dir_item) +
- btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
- btrfs_crit(root->fs_info,
- "invalid dir item name + data len: %u + %u",
+ btrfs_dir_name_len(leaf, dir_item)) >
+ BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u",
(unsigned)btrfs_dir_name_len(leaf, dir_item),
(unsigned)btrfs_dir_data_len(leaf, dir_item));
return 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3a57f99d96aa..18004169552c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -68,15 +68,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
-static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages,
int mark);
-static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
struct extent_io_tree *pinned_extents);
-static int btrfs_cleanup_transaction(struct btrfs_root *root);
-static void btrfs_error_commit_super(struct btrfs_root *root);
+static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
+static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
/*
* btrfs_end_io_wq structs are used to do processing in task context when an IO
@@ -224,6 +224,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret;
@@ -231,8 +232,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
- em->bdev =
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
goto out;
}
@@ -247,7 +247,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
em->block_start = 0;
- em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
@@ -271,7 +271,7 @@ u32 btrfs_csum_data(char *data, u32 seed, size_t len)
return btrfs_crc32c(seed, data, len);
}
-void btrfs_csum_final(u32 crc, char *result)
+void btrfs_csum_final(u32 crc, u8 *result)
{
put_unaligned_le32(~crc, result);
}
@@ -440,7 +440,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*/
-static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
u64 parent_transid)
{
@@ -452,7 +452,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
int failed_mirror = 0;
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
btree_get_extent, mirror_num);
@@ -472,7 +472,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
break;
- num_copies = btrfs_num_copies(root->fs_info,
+ num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
if (num_copies == 1)
break;
@@ -491,7 +491,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
}
if (failed && !ret && failed_mirror)
- repair_eb_io_failure(root, eb, failed_mirror);
+ repair_eb_io_failure(fs_info, eb, failed_mirror);
return ret;
}
@@ -545,47 +545,63 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
return ret;
}
-#define CORRUPT(reason, eb, root, slot) \
- btrfs_crit(root->fs_info, "corrupt %s, %s: block=%llu," \
- " root=%llu, slot=%d", \
- btrfs_header_level(eb) == 0 ? "leaf" : "node",\
+#define CORRUPT(reason, eb, root, slot) \
+ btrfs_crit(root->fs_info, \
+ "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
+ btrfs_header_level(eb) == 0 ? "leaf" : "node", \
reason, btrfs_header_bytenr(eb), root->objectid, slot)
static noinline int check_leaf(struct btrfs_root *root,
struct extent_buffer *leaf)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key leaf_key;
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (nritems == 0) {
+ /*
+ * Extent buffers from a relocation tree have a owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So skip
+ * this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
struct btrfs_root *check_root;
key.objectid = btrfs_header_owner(leaf);
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- check_root = btrfs_get_fs_root(root->fs_info, &key, false);
+ check_root = btrfs_get_fs_root(fs_info, &key, false);
/*
* The only reason we also check NULL here is that during
* open_ctree() some roots has not yet been set up.
*/
if (!IS_ERR_OR_NULL(check_root)) {
+ struct extent_buffer *eb;
+
+ eb = btrfs_root_node(check_root);
/* if leaf is the root, then it's fine */
- if (leaf->start !=
- btrfs_root_bytenr(&check_root->root_item)) {
+ if (leaf != eb) {
CORRUPT("non-root leaf's nritems is 0",
- leaf, root, 0);
+ leaf, check_root, 0);
+ free_extent_buffer(eb);
return -EIO;
}
+ free_extent_buffer(eb);
}
return 0;
}
+ if (nritems == 0)
+ return 0;
+
/* Check the 0 item */
if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
- BTRFS_LEAF_DATA_SIZE(root)) {
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
CORRUPT("invalid item offset size pair", leaf, root, 0);
return -EIO;
}
@@ -624,7 +640,7 @@ static noinline int check_leaf(struct btrfs_root *root,
* all point outside of the leaf.
*/
if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(root)) {
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
CORRUPT("slot end outside of leaf", leaf, root, slot);
return -EIO;
}
@@ -641,7 +657,7 @@ static int check_node(struct btrfs_root *root, struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
btrfs_crit(root->fs_info,
"corrupt node: block %llu root %llu nritems %lu",
node->start, root->objectid, nr);
@@ -747,7 +763,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(fs_info, eb, eb->start, ret);
+ btree_readahead_hook(fs_info, eb, ret);
if (ret) {
/*
@@ -772,7 +788,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
+ btree_readahead_hook(eb->fs_info, eb, -EIO);
return -EIO; /* we fixed nothing */
}
@@ -930,7 +946,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
- if (bio->bi_opf & REQ_SYNC)
+ if (op_is_sync(bio->bi_opf))
btrfs_set_work_high_priority(&async->work);
btrfs_queue_work(fs_info->workers, &async->work);
@@ -981,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
- ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -1004,6 +1020,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(inode, bio_flags);
int ret;
@@ -1012,23 +1029,22 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
- ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
- bio, BTRFS_WQ_ENDIO_METADATA);
+ ret = btrfs_bio_wq_end_io(fs_info, bio,
+ BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, bio, mirror_num, 0,
+ ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0,
bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
@@ -1146,12 +1162,12 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
+void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct extent_buffer *buf = NULL;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct inode *btree_inode = fs_info->btree_inode;
- buf = btrfs_find_create_tree_block(root, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(buf))
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1159,15 +1175,15 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
free_extent_buffer(buf);
}
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
int mirror_num, struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
- struct inode *btree_inode = root->fs_info->btree_inode;
+ struct inode *btree_inode = fs_info->btree_inode;
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(buf))
return 0;
@@ -1191,19 +1207,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
return 0;
}
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr)
-{
- return find_extent_buffer(fs_info, bytenr);
-}
-
-struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr)
+struct extent_buffer *btrfs_find_create_tree_block(
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
- if (btrfs_is_testing(root->fs_info))
- return alloc_test_extent_buffer(root->fs_info, bytenr,
- root->nodesize);
- return alloc_extent_buffer(root->fs_info, bytenr);
+ if (btrfs_is_testing(fs_info))
+ return alloc_test_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr);
}
@@ -1219,17 +1229,17 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
buf->start, buf->start + buf->len - 1);
}
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 parent_transid)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(root, buf, parent_transid);
+ ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
if (ret) {
free_extent_buffer(buf);
return ERR_PTR(ret);
@@ -1283,16 +1293,12 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
kfree(writers);
}
-static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
- struct btrfs_root *root, struct btrfs_fs_info *fs_info,
+static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
root->node = NULL;
root->commit_root = NULL;
- root->sectorsize = sectorsize;
- root->nodesize = nodesize;
- root->stripesize = stripesize;
root->state = 0;
root->orphan_cleanup_state = 0;
@@ -1370,8 +1376,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
-struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
- u32 sectorsize, u32 nodesize)
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
@@ -1381,9 +1386,9 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
+
/* We don't use the stripesize in selftest, set it as sectorsize */
- __setup_root(nodesize, sectorsize, sectorsize, root, fs_info,
- BTRFS_ROOT_TREE_OBJECTID);
+ __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
root->alloc_bytenr = 0;
return root;
@@ -1405,8 +1410,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->sectorsize,
- tree_root->stripesize, root, fs_info, objectid);
+ __setup_root(root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
@@ -1418,18 +1422,15 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
goto fail;
}
- memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(leaf, leaf->start);
btrfs_set_header_generation(leaf, trans->transid);
btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(leaf, objectid);
root->node = leaf;
- write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
- BTRFS_FSID_SIZE);
- write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(leaf),
- BTRFS_UUID_SIZE);
+ write_extent_buffer_fsid(leaf, fs_info->fsid);
+ write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
btrfs_mark_buffer_dirty(leaf);
root->commit_root = btrfs_root_node(root);
@@ -1474,16 +1475,13 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
root = btrfs_alloc_root(fs_info, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(tree_root->nodesize, tree_root->sectorsize,
- tree_root->stripesize, root, fs_info,
- BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1505,15 +1503,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
return ERR_CAST(leaf);
}
- memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(leaf, leaf->start);
btrfs_set_header_generation(leaf, trans->transid);
btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
root->node = leaf;
- write_extent_buffer(root->node, root->fs_info->fsid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE);
+ write_extent_buffer_fsid(root->node, fs_info->fsid);
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
return root;
@@ -1535,10 +1532,11 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
- log_root = alloc_log_tree(trans, root->fs_info);
+ log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -1549,7 +1547,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+ btrfs_set_stack_inode_nbytes(inode_item,
+ fs_info->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1581,8 +1580,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
goto alloc_fail;
}
- __setup_root(tree_root->nodesize, tree_root->sectorsize,
- tree_root->stripesize, root, fs_info, key->objectid);
+ __setup_root(root, fs_info, key->objectid);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
@@ -1593,7 +1591,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
- root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+ root->node = read_tree_block(fs_info,
+ btrfs_root_bytenr(&root->root_item),
generation);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
@@ -1848,6 +1847,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int again;
struct btrfs_trans_handle *trans;
@@ -1855,40 +1855,40 @@ static int cleaner_kthread(void *arg)
again = 0;
/* Make the cleaner go to sleep early. */
- if (btrfs_need_cleaner_sleep(root))
+ if (btrfs_need_cleaner_sleep(fs_info))
goto sleep;
/*
* Do not do anything if we might cause open_ctree() to block
* before we have finished mounting the filesystem.
*/
- if (!test_bit(BTRFS_FS_OPEN, &root->fs_info->flags))
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
goto sleep;
- if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+ if (!mutex_trylock(&fs_info->cleaner_mutex))
goto sleep;
/*
* Avoid the problem that we change the status of the fs
* during the above check and trylock.
*/
- if (btrfs_need_cleaner_sleep(root)) {
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ if (btrfs_need_cleaner_sleep(fs_info)) {
+ mutex_unlock(&fs_info->cleaner_mutex);
goto sleep;
}
- mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
- btrfs_run_delayed_iputs(root);
- mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ mutex_unlock(&fs_info->cleaner_mutex);
/*
* The defragger has dealt with the R/O remount and umount,
* needn't do anything special here.
*/
- btrfs_run_defrag_inodes(root->fs_info);
+ btrfs_run_defrag_inodes(fs_info);
/*
* Acquires fs_info->delete_unused_bgs_mutex to avoid racing
@@ -1898,7 +1898,7 @@ static int cleaner_kthread(void *arg)
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
- btrfs_delete_unused_bgs(root->fs_info);
+ btrfs_delete_unused_bgs(fs_info);
sleep:
if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1922,15 +1922,15 @@ sleep:
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"cleaner transaction attach returned %ld",
PTR_ERR(trans));
} else {
int ret;
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"cleaner open transaction commit returned %d",
ret);
}
@@ -1941,6 +1941,7 @@ sleep:
static int transaction_kthread(void *arg)
{
struct btrfs_root *root = arg;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
@@ -1950,26 +1951,26 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = HZ * root->fs_info->commit_interval;
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ delay = HZ * fs_info->commit_interval;
+ mutex_lock(&fs_info->transaction_kthread_mutex);
- spin_lock(&root->fs_info->trans_lock);
- cur = root->fs_info->running_transaction;
+ spin_lock(&fs_info->trans_lock);
+ cur = fs_info->running_transaction;
if (!cur) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
goto sleep;
}
now = get_seconds();
if (cur->state < TRANS_STATE_BLOCKED &&
(now < cur->start_time ||
- now - cur->start_time < root->fs_info->commit_interval)) {
- spin_unlock(&root->fs_info->trans_lock);
+ now - cur->start_time < fs_info->commit_interval)) {
+ spin_unlock(&fs_info->trans_lock);
delay = HZ * 5;
goto sleep;
}
transid = cur->transid;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
/* If the file system is aborted, this will always fail. */
trans = btrfs_attach_transaction(root);
@@ -1979,20 +1980,20 @@ static int transaction_kthread(void *arg)
goto sleep;
}
if (transid == trans->transid) {
- btrfs_commit_transaction(trans, root);
+ btrfs_commit_transaction(trans);
} else {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
}
sleep:
- wake_up_process(root->fs_info->cleaner_kthread);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ wake_up_process(fs_info->cleaner_kthread);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &root->fs_info->fs_state)))
- btrfs_cleanup_transaction(root);
+ &fs_info->fs_state)))
+ btrfs_cleanup_transaction(fs_info);
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- (!btrfs_transaction_blocked(root->fs_info) ||
+ (!btrfs_transaction_blocked(fs_info) ||
cannot_commit))
schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
@@ -2279,8 +2280,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_free_log_root_tree(NULL, fs_info);
- btrfs_destroy_pinned_extent(fs_info->tree_root,
- fs_info->pinned_extents);
+ btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
}
}
@@ -2306,33 +2306,31 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
init_waitqueue_head(&fs_info->balance_wait_q);
}
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_root *tree_root)
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
- fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- set_nlink(fs_info->btree_inode, 1);
+ struct inode *inode = fs_info->btree_inode;
+
+ inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ set_nlink(inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
* the devices in the system
*/
- fs_info->btree_inode->i_size = OFFSET_MAX;
- fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &btree_aops;
- RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping);
- BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+ RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
+ BTRFS_I(inode)->io_tree.track_uptodate = 0;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+ BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- set_bit(BTRFS_INODE_DUMMY,
- &BTRFS_I(fs_info->btree_inode)->runtime_flags);
- btrfs_insert_inode_hash(fs_info->btree_inode);
+ BTRFS_I(inode)->root = fs_info->tree_root;
+ memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+ set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ btrfs_insert_inode_hash(inode);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2453,7 +2451,6 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
- struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
@@ -2467,12 +2464,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (!log_tree_root)
return -ENOMEM;
- __setup_root(tree_root->nodesize, tree_root->sectorsize,
- tree_root->stripesize, log_tree_root, fs_info,
- BTRFS_TREE_LOG_OBJECTID);
+ __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
- log_tree_root->node = read_tree_block(tree_root, bytenr,
- fs_info->generation + 1);
+ log_tree_root->node = read_tree_block(fs_info, bytenr,
+ fs_info->generation + 1);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2487,15 +2482,15 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_handle_fs_error(tree_root->fs_info, ret,
- "Failed to recover log tree");
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
return ret;
}
if (fs_info->sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
+ ret = btrfs_commit_super(fs_info);
if (ret)
return ret;
}
@@ -2503,13 +2498,15 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return 0;
}
-static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
- struct btrfs_root *tree_root)
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key location;
int ret;
+ BUG_ON(!fs_info->tree_root);
+
location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
@@ -2720,7 +2717,7 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
- btrfs_init_btree_inode(fs_info, tree_root);
+ btrfs_init_btree_inode(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2758,14 +2755,18 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->pinned_chunks);
+ /* Usable values until the real ones are cached from the superblock */
+ fs_info->nodesize = 4096;
+ fs_info->sectorsize = 4096;
+ fs_info->stripesize = 4096;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
goto fail_alloc;
}
- __setup_root(4096, 4096, 4096, tree_root,
- fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
invalidate_bdev(fs_devices->latest_bdev);
@@ -2829,7 +2830,7 @@ int open_ctree(struct super_block *sb,
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- ret = btrfs_parse_options(tree_root, options, sb->s_flags);
+ ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
goto fail_alloc;
@@ -2847,7 +2848,7 @@ int open_ctree(struct super_block *sb,
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
@@ -2870,6 +2871,11 @@ int open_ctree(struct super_block *sb,
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+ /* Cache block sizes */
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->stripesize = stripesize;
+
/*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
@@ -2910,15 +2916,11 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
SZ_4M / PAGE_SIZE);
- tree_root->nodesize = nodesize;
- tree_root->sectorsize = sectorsize;
- tree_root->stripesize = stripesize;
-
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
mutex_lock(&fs_info->chunk_mutex);
- ret = btrfs_read_sys_array(tree_root);
+ ret = btrfs_read_sys_array(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_err(fs_info, "failed to read the system array: %d", ret);
@@ -2927,10 +2929,9 @@ int open_ctree(struct super_block *sb,
generation = btrfs_super_chunk_root_generation(disk_super);
- __setup_root(nodesize, sectorsize, stripesize, chunk_root,
- fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+ __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
- chunk_root->node = read_tree_block(chunk_root,
+ chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
generation);
if (IS_ERR(chunk_root->node) ||
@@ -2947,7 +2948,7 @@ int open_ctree(struct super_block *sb,
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
- ret = btrfs_read_chunk_tree(chunk_root);
+ ret = btrfs_read_chunk_tree(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
goto fail_tree_roots;
@@ -2967,7 +2968,7 @@ int open_ctree(struct super_block *sb,
retry_root_backup:
generation = btrfs_super_generation(disk_super);
- tree_root->node = read_tree_block(tree_root,
+ tree_root->node = read_tree_block(fs_info,
btrfs_super_root(disk_super),
generation);
if (IS_ERR(tree_root->node) ||
@@ -2995,7 +2996,7 @@ retry_root_backup:
mutex_unlock(&tree_root->objectid_mutex);
- ret = btrfs_read_roots(fs_info, tree_root);
+ ret = btrfs_read_roots(fs_info);
if (ret)
goto recovery_tree_root;
@@ -3048,7 +3049,7 @@ retry_root_backup:
goto fail_sysfs;
}
- ret = btrfs_read_block_groups(fs_info->extent_root);
+ ret = btrfs_read_block_groups(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to read block groups: %d", ret);
goto fail_sysfs;
@@ -3076,8 +3077,8 @@ retry_root_backup:
if (IS_ERR(fs_info->transaction_kthread))
goto fail_cleaner;
- if (!btrfs_test_opt(tree_root->fs_info, SSD) &&
- !btrfs_test_opt(tree_root->fs_info, NOSSD) &&
+ if (!btrfs_test_opt(fs_info, SSD) &&
+ !btrfs_test_opt(fs_info, NOSSD) &&
!fs_info->fs_devices->rotating) {
btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
btrfs_set_opt(fs_info->mount_opt, SSD);
@@ -3090,9 +3091,9 @@ retry_root_backup:
btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) {
- ret = btrfsic_mount(tree_root, fs_devices,
- btrfs_test_opt(tree_root->fs_info,
+ if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(fs_info, fs_devices,
+ btrfs_test_opt(fs_info,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
1 : 0,
fs_info->check_integrity_print_mask);
@@ -3108,7 +3109,7 @@ retry_root_backup:
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
- !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3116,7 +3117,7 @@ retry_root_backup:
}
}
- ret = btrfs_find_orphan_roots(tree_root);
+ ret = btrfs_find_orphan_roots(fs_info);
if (ret)
goto fail_qgroup;
@@ -3164,19 +3165,19 @@ retry_root_backup:
if (ret) {
btrfs_warn(fs_info,
"failed to clear free space tree: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
}
- if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
btrfs_info(fs_info, "creating free space tree");
ret = btrfs_create_free_space_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to create free space tree: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
}
@@ -3185,7 +3186,7 @@ retry_root_backup:
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
up_read(&fs_info->cleanup_work_sem);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
up_read(&fs_info->cleanup_work_sem);
@@ -3193,14 +3194,14 @@ retry_root_backup:
ret = btrfs_resume_balance_async(fs_info);
if (ret) {
btrfs_warn(fs_info, "failed to resume balance: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
@@ -3212,10 +3213,10 @@ retry_root_backup:
if (ret) {
btrfs_warn(fs_info,
"failed to create the UUID tree: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
- } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) ||
+ } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
fs_info->generation !=
btrfs_super_uuid_tree_generation(disk_super)) {
btrfs_info(fs_info, "checking UUID tree");
@@ -3223,7 +3224,7 @@ retry_root_backup:
if (ret) {
btrfs_warn(fs_info,
"failed to check the UUID tree: %d", ret);
- close_ctree(tree_root);
+ close_ctree(fs_info);
return ret;
}
} else {
@@ -3243,7 +3244,7 @@ fail_qgroup:
btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
- btrfs_cleanup_transaction(fs_info->tree_root);
+ btrfs_cleanup_transaction(fs_info);
btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -3291,7 +3292,7 @@ fail:
return err;
recovery_tree_root:
- if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
goto fail_tree_roots;
free_root_pointers(fs_info, 0);
@@ -3317,7 +3318,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+ btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s",
rcu_str_deref(device->name));
/* note, we don't set_buffer_write_io_error because we have
@@ -3462,7 +3463,7 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- btrfs_err(device->dev_root->fs_info,
+ btrfs_err(device->fs_info,
"couldn't get super buffer head for bytenr %llu",
bytenr);
errors++;
@@ -3485,9 +3486,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to go down lazy.
*/
if (i == 0)
- ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh);
else
- ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
if (ret)
errors++;
}
@@ -3551,7 +3552,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
bio->bi_end_io = btrfs_end_empty_barrier;
bio->bi_bdev = device->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
device->flush_bio = bio;
@@ -3695,7 +3696,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
return num_tolerated_disk_barrier_failures;
}
-static int write_all_supers(struct btrfs_root *root, int max_mirrors)
+static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
struct list_head *head;
struct btrfs_device *dev;
@@ -3707,23 +3708,23 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER);
- backup_super_roots(root->fs_info);
+ do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
+ backup_super_roots(fs_info);
- sb = root->fs_info->super_for_commit;
+ sb = fs_info->super_for_commit;
dev_item = &sb->dev_item;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- head = &root->fs_info->fs_devices->devices;
- max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ head = &fs_info->fs_devices->devices;
+ max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
if (do_barriers) {
- ret = barrier_all_devices(root->fs_info);
+ ret = barrier_all_devices(fs_info);
if (ret) {
mutex_unlock(
- &root->fs_info->fs_devices->device_list_mutex);
- btrfs_handle_fs_error(root->fs_info, ret,
- "errors while submitting device barriers.");
+ &fs_info->fs_devices->device_list_mutex);
+ btrfs_handle_fs_error(fs_info, ret,
+ "errors while submitting device barriers.");
return ret;
}
}
@@ -3757,13 +3758,14 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
total_errors++;
}
if (total_errors > max_errors) {
- btrfs_err(root->fs_info, "%d errors while writing supers",
- total_errors);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ btrfs_err(fs_info, "%d errors while writing supers",
+ total_errors);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_handle_fs_error(root->fs_info, -EIO,
- "%d errors while writing supers", total_errors);
+ btrfs_handle_fs_error(fs_info, -EIO,
+ "%d errors while writing supers",
+ total_errors);
return -EIO;
}
@@ -3778,19 +3780,20 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret)
total_errors++;
}
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_handle_fs_error(root->fs_info, -EIO,
- "%d errors while writing supers", total_errors);
+ btrfs_handle_fs_error(fs_info, -EIO,
+ "%d errors while writing supers",
+ total_errors);
return -EIO;
}
return 0;
}
int write_ctree_super(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int max_mirrors)
+ struct btrfs_fs_info *fs_info, int max_mirrors)
{
- return write_all_supers(root, max_mirrors);
+ return write_all_supers(fs_info, max_mirrors);
}
/* Drop a fs root from the radix tree and free it. */
@@ -3826,7 +3829,7 @@ static void free_fs_root(struct btrfs_root *root)
{
iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -3896,28 +3899,29 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
return err;
}
-int btrfs_commit_super(struct btrfs_root *root)
+int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- wake_up_process(root->fs_info->cleaner_kthread);
+ mutex_lock(&fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ wake_up_process(fs_info->cleaner_kthread);
/* wait until ongoing cleanup work done */
- down_write(&root->fs_info->cleanup_work_sem);
- up_write(&root->fs_info->cleanup_work_sem);
+ down_write(&fs_info->cleanup_work_sem);
+ up_write(&fs_info->cleanup_work_sem);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- return btrfs_commit_transaction(trans, root);
+ return btrfs_commit_transaction(trans);
}
-void close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3952,15 +3956,15 @@ void close_ctree(struct btrfs_root *root)
* block groups queued for removal, the deletion will be
* skipped when we quit the cleaner thread.
*/
- btrfs_delete_unused_bgs(root->fs_info);
+ btrfs_delete_unused_bgs(fs_info);
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(fs_info);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
}
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- btrfs_error_commit_super(root);
+ btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -3996,8 +4000,8 @@ void close_ctree(struct btrfs_root *root)
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
- btrfsic_unmount(root, fs_info->fs_devices);
+ if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
+ btrfsic_unmount(fs_info->fs_devices);
#endif
btrfs_close_devices(fs_info->fs_devices);
@@ -4014,7 +4018,7 @@ void close_ctree(struct btrfs_root *root)
__btrfs_free_block_rsv(root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
while (!list_empty(&fs_info->pinned_chunks)) {
struct extent_map *em;
@@ -4023,7 +4027,7 @@ void close_ctree(struct btrfs_root *root)
list_del_init(&em->list);
free_extent_map(em);
}
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4045,6 +4049,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
@@ -4059,24 +4064,25 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
return;
#endif
root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ fs_info = root->fs_info;
btrfs_assert_tree_locked(buf);
- if (transid != root->fs_info->generation)
+ if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
- buf->start, transid, root->fs_info->generation);
+ buf->start, transid, fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty)
- __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+ __percpu_counter_add(&fs_info->dirty_metadata_bytes,
buf->len,
- root->fs_info->dirty_metadata_batch);
+ fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
- btrfs_print_leaf(root, buf);
+ btrfs_print_leaf(fs_info, buf);
ASSERT(0);
}
#endif
}
-static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
int flush_delayed)
{
/*
@@ -4089,30 +4095,31 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
return;
if (flush_delayed)
- btrfs_balance_delayed_items(root);
+ btrfs_balance_delayed_items(fs_info);
- ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+ ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH);
if (ret > 0) {
- balance_dirty_pages_ratelimited(
- root->fs_info->btree_inode->i_mapping);
+ balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
}
}
-void btrfs_btree_balance_dirty(struct btrfs_root *root)
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
- __btrfs_btree_balance_dirty(root, 1);
+ __btrfs_btree_balance_dirty(fs_info, 1);
}
-void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
- __btrfs_btree_balance_dirty(root, 0);
+ __btrfs_btree_balance_dirty(fs_info, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- return btree_read_extent_buffer_pages(root, buf, parent_transid);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
}
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -4263,17 +4270,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
return ret;
}
-static void btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ mutex_lock(&fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
- down_write(&root->fs_info->cleanup_work_sem);
- up_write(&root->fs_info->cleanup_work_sem);
+ down_write(&fs_info->cleanup_work_sem);
+ up_write(&fs_info->cleanup_work_sem);
/* cleanup FS via transaction */
- btrfs_cleanup_transaction(root);
+ btrfs_cleanup_transaction(fs_info);
}
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4316,7 +4323,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -4328,7 +4335,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
- btrfs_info(root->fs_info, "delayed_refs has NO entry");
+ btrfs_info(fs_info, "delayed_refs has NO entry");
return ret;
}
@@ -4354,6 +4361,8 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list) {
ref->in_tree = 0;
list_del(&ref->list);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
@@ -4371,7 +4380,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
mutex_unlock(&head->mutex);
if (pin_bytes)
- btrfs_pin_extent(root, head->node.bytenr,
+ btrfs_pin_extent(fs_info, head->node.bytenr,
head->node.num_bytes, 1);
btrfs_put_delayed_ref(&head->node);
cond_resched();
@@ -4435,7 +4444,7 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->delalloc_root_lock);
}
-static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages,
int mark)
{
@@ -4452,8 +4461,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
- eb = btrfs_find_tree_block(root->fs_info, start);
- start += root->nodesize;
+ eb = find_extent_buffer(fs_info, start);
+ start += fs_info->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
@@ -4468,7 +4477,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
return ret;
}
-static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
struct extent_io_tree *pinned_extents)
{
struct extent_io_tree *unpin;
@@ -4486,15 +4495,15 @@ again:
break;
clear_extent_dirty(unpin, start, end);
- btrfs_error_unpin_extent_range(root, start, end);
+ btrfs_error_unpin_extent_range(fs_info, start, end);
cond_resched();
}
if (loop) {
- if (unpin == &root->fs_info->freed_extents[0])
- unpin = &root->fs_info->freed_extents[1];
+ if (unpin == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
else
- unpin = &root->fs_info->freed_extents[0];
+ unpin = &fs_info->freed_extents[0];
loop = false;
goto again;
}
@@ -4517,7 +4526,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
}
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *cache;
@@ -4527,8 +4536,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_block_group_cache,
dirty_list);
if (!cache) {
- btrfs_err(root->fs_info,
- "orphan block group dirty_bgs list");
+ btrfs_err(fs_info, "orphan block group dirty_bgs list");
spin_unlock(&cur_trans->dirty_bgs_lock);
return;
}
@@ -4556,8 +4564,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_block_group_cache,
io_list);
if (!cache) {
- btrfs_err(root->fs_info,
- "orphan block group on io_bgs list");
+ btrfs_err(fs_info, "orphan block group on io_bgs list");
return;
}
@@ -4570,27 +4577,27 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- btrfs_cleanup_dirty_bgs(cur_trans, root);
+ btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
- btrfs_destroy_delayed_refs(cur_trans, root);
+ btrfs_destroy_delayed_refs(cur_trans, fs_info);
cur_trans->state = TRANS_STATE_COMMIT_START;
- wake_up(&root->fs_info->transaction_blocked_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
cur_trans->state = TRANS_STATE_UNBLOCKED;
- wake_up(&root->fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_wait);
- btrfs_destroy_delayed_inodes(root);
- btrfs_assert_delayed_root_empty(root);
+ btrfs_destroy_delayed_inodes(fs_info);
+ btrfs_assert_delayed_root_empty(fs_info);
- btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+ btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
- btrfs_destroy_pinned_extent(root,
- root->fs_info->pinned_extents);
+ btrfs_destroy_pinned_extent(fs_info,
+ fs_info->pinned_extents);
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
@@ -4601,27 +4608,27 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
*/
}
-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
struct btrfs_transaction *t;
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
+ mutex_lock(&fs_info->transaction_kthread_mutex);
- spin_lock(&root->fs_info->trans_lock);
- while (!list_empty(&root->fs_info->trans_list)) {
- t = list_first_entry(&root->fs_info->trans_list,
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&fs_info->trans_list)) {
+ t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
if (t->state >= TRANS_STATE_COMMIT_START) {
atomic_inc(&t->use_count);
- spin_unlock(&root->fs_info->trans_lock);
- btrfs_wait_for_commit(root, t->transid);
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_wait_for_commit(fs_info, t->transid);
btrfs_put_transaction(t);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
continue;
}
- if (t == root->fs_info->running_transaction) {
+ if (t == fs_info->running_transaction) {
t->state = TRANS_STATE_COMMIT_DOING;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
/*
* We wait for 0 num_writers since we don't hold a trans
* handle open currently for this transaction.
@@ -4629,27 +4636,27 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
wait_event(t->writer_wait,
atomic_read(&t->num_writers) == 0);
} else {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
}
- btrfs_cleanup_one_transaction(t, root);
+ btrfs_cleanup_one_transaction(t, fs_info);
- spin_lock(&root->fs_info->trans_lock);
- if (t == root->fs_info->running_transaction)
- root->fs_info->running_transaction = NULL;
+ spin_lock(&fs_info->trans_lock);
+ if (t == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
list_del_init(&t->list);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
- trace_btrfs_transaction_commit(root);
- spin_lock(&root->fs_info->trans_lock);
- }
- spin_unlock(&root->fs_info->trans_lock);
- btrfs_destroy_all_ordered_extents(root->fs_info);
- btrfs_destroy_delayed_inodes(root);
- btrfs_assert_delayed_root_empty(root);
- btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
- btrfs_destroy_all_delalloc_inodes(root->fs_info);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ trace_btrfs_transaction_commit(fs_info->tree_root);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+ btrfs_destroy_all_ordered_extents(fs_info);
+ btrfs_destroy_delayed_inodes(fs_info);
+ btrfs_assert_delayed_root_empty(fs_info);
+ btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
+ btrfs_destroy_all_delalloc_inodes(fs_info);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 1a3237e5700f..44dcd9af6b7c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,27 +44,26 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
- u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 parent_transid);
+void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
-struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr);
+struct extent_buffer *btrfs_find_create_tree_block(
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-void close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_fs_info *fs_info);
int write_ctree_super(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int max_mirrors);
+ struct btrfs_fs_info *fs_info, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
struct buffer_head **bh_ret);
-int btrfs_commit_super(struct btrfs_root *root);
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr);
+int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
@@ -85,15 +84,14 @@ btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root);
-void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
- u32 sectorsize, u32 nodesize);
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
/*
@@ -121,7 +119,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
-void btrfs_csum_final(u32 crc, char *result);
+void btrfs_csum_final(u32 crc, u8 *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -137,9 +135,9 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
- struct btrfs_root *root);
+ struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 objectid);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 2513a7f53334..340d90751263 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -153,6 +153,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -169,7 +170,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
- root = root->fs_info->tree_root;
+ root = fs_info->tree_root;
} else {
key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
@@ -205,13 +206,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
btrfs_free_path(path);
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
- return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ return btrfs_get_dentry(fs_info->sb, key.objectid,
found_key.offset, 0, 0);
}
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+ return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -222,6 +223,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode_ref *iref;
@@ -250,7 +252,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
- root = root->fs_info->tree_root;
+ root = fs_info->tree_root;
} else {
key.objectid = ino;
key.offset = btrfs_ino(dir);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4607af38c72e..dcd2e798767e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -61,10 +61,10 @@ enum {
};
static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
@@ -73,17 +73,17 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 flags,
+ struct btrfs_fs_info *fs_info, u64 flags,
int force);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
@@ -96,8 +96,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_pin_extent(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int reserved);
static int __reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_space_info *space_info,
u64 orig_bytes,
@@ -223,18 +221,18 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
return ret;
}
-static int add_excluded_extent(struct btrfs_root *root,
+static int add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&root->fs_info->freed_extents[0],
+ set_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
- set_extent_bits(&root->fs_info->freed_extents[1],
+ set_extent_bits(&fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE);
return 0;
}
-static void free_excluded_extents(struct btrfs_root *root,
+static void free_excluded_extents(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *cache)
{
u64 start, end;
@@ -242,13 +240,13 @@ static void free_excluded_extents(struct btrfs_root *root,
start = cache->key.objectid;
end = start + cache->key.offset - 1;
- clear_extent_bits(&root->fs_info->freed_extents[0],
+ clear_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
- clear_extent_bits(&root->fs_info->freed_extents[1],
+ clear_extent_bits(&fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE);
}
-static int exclude_super_stripes(struct btrfs_root *root,
+static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *cache)
{
u64 bytenr;
@@ -259,7 +257,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
cache->bytes_super += stripe_len;
- ret = add_excluded_extent(root, cache->key.objectid,
+ ret = add_excluded_extent(fs_info, cache->key.objectid,
stripe_len);
if (ret)
return ret;
@@ -267,7 +265,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(root->fs_info, cache->key.objectid,
+ ret = btrfs_rmap_block(fs_info, cache->key.objectid,
bytenr, 0, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -293,7 +291,7 @@ static int exclude_super_stripes(struct btrfs_root *root,
}
cache->bytes_super += len;
- ret = add_excluded_extent(root, start, len);
+ ret = add_excluded_extent(fs_info, start, len);
if (ret) {
kfree(logical);
return ret;
@@ -329,13 +327,13 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
}
#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_root *root,
- struct btrfs_block_group_cache *block_group)
+static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
u64 start = block_group->key.objectid;
u64 len = block_group->key.offset;
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
- root->nodesize : root->sectorsize;
+ fs_info->nodesize : fs_info->sectorsize;
u64 step = chunk << 1;
while (len > chunk) {
@@ -394,9 +392,9 @@ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group;
- struct btrfs_fs_info *fs_info;
- struct btrfs_root *extent_root;
+ struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -406,10 +404,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
int ret;
bool wakeup = true;
- block_group = caching_ctl->block_group;
- fs_info = block_group->fs_info;
- extent_root = fs_info->extent_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -422,7 +416,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
* allocate from this block group until we've had a chance to fragment
* the free space.
*/
- if (btrfs_should_fragment_free_space(extent_root, block_group))
+ if (btrfs_should_fragment_free_space(block_group))
wakeup = false;
#endif
/*
@@ -510,7 +504,7 @@ next:
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
- fs_info->tree_root->nodesize;
+ fs_info->nodesize;
else
last = key.objectid + key.offset;
@@ -561,7 +555,7 @@ static noinline void caching_thread(struct btrfs_work *work)
spin_unlock(&block_group->lock);
#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ if (btrfs_should_fragment_free_space(block_group)) {
u64 bytes_used;
spin_lock(&block_group->space_info->lock);
@@ -571,14 +565,14 @@ static noinline void caching_thread(struct btrfs_work *work)
block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
spin_unlock(&block_group->space_info->lock);
- fragment_free_space(extent_root, block_group);
+ fragment_free_space(block_group);
}
#endif
caching_ctl->progress = (u64)-1;
up_read(&fs_info->commit_root_sem);
- free_excluded_extents(fs_info->extent_root, block_group);
+ free_excluded_extents(fs_info, block_group);
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -668,8 +662,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
if (ret == 1 &&
- btrfs_should_fragment_free_space(fs_info->extent_root,
- cache)) {
+ btrfs_should_fragment_free_space(cache)) {
u64 bytes_used;
spin_lock(&cache->space_info->lock);
@@ -679,7 +672,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- fragment_free_space(fs_info->extent_root, cache);
+ fragment_free_space(cache);
}
#endif
mutex_unlock(&caching_ctl->mutex);
@@ -687,7 +680,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
wake_up(&caching_ctl->wait);
if (ret == 1) {
put_caching_control(caching_ctl);
- free_excluded_extents(fs_info->extent_root, cache);
+ free_excluded_extents(fs_info, cache);
return 0;
}
} else {
@@ -778,7 +771,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
}
/* simple helper to search for an existing data extent at a given offset */
-int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
@@ -791,8 +784,7 @@ int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
key.objectid = start;
key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
- 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
@@ -807,7 +799,7 @@ int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
* the delayed refs are not processed.
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags)
{
struct btrfs_delayed_ref_head *head;
@@ -825,8 +817,8 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
* If we don't have skinny metadata, don't bother doing anything
* different
*/
- if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
- offset = root->nodesize;
+ if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ offset = fs_info->nodesize;
metadata = 0;
}
@@ -847,8 +839,7 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(trans, root->fs_info->extent_root,
- &key, path, 0, 0);
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -859,7 +850,7 @@ search_again:
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
- key.offset == root->nodesize)
+ key.offset == fs_info->nodesize)
ret = 0;
}
}
@@ -1101,7 +1092,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
return ret;
BUG_ON(ret); /* Corruption */
- btrfs_extend_item(root, path, new_size);
+ btrfs_extend_item(root->fs_info, path, new_size);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1114,7 +1105,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
BTRFS_BLOCK_FLAG_FULL_BACKREF);
bi = (struct btrfs_tree_block_info *)(item + 1);
/* FIXME: get first key of the block */
- memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+ memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
btrfs_set_tree_block_level(leaf, bi, (int)owner);
} else {
btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
@@ -1540,6 +1531,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int insert)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1553,8 +1545,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int want;
int ret;
int err = 0;
- bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
- SKINNY_METADATA);
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1748,7 +1739,7 @@ void setup_inline_extent_backref(struct btrfs_root *root,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(root, path, size);
+ btrfs_extend_item(root->fs_info, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1875,7 +1866,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(root, path, item_size, 1);
+ btrfs_truncate_item(root->fs_info, path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
}
@@ -2022,7 +2013,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
-int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
int ret;
@@ -2034,10 +2025,10 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
* Avoid races with device replace and make sure our bbio has devices
* associated to its stripes that don't go away while we are discarding.
*/
- btrfs_bio_counter_inc_blocked(root->fs_info);
+ btrfs_bio_counter_inc_blocked(fs_info);
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
- bytenr, &num_bytes, &bbio, 0);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
+ &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
struct btrfs_bio_stripe *stripe = bbio->stripes;
@@ -2067,7 +2058,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
}
btrfs_put_bbio(bbio);
}
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
if (actual_bytes)
*actual_bytes = discarded_bytes;
@@ -2080,12 +2071,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -2105,13 +2095,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
}
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
struct btrfs_delayed_extent_op *extent_op)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
@@ -2154,7 +2143,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
- ret = insert_extent_backref(trans, root->fs_info->extent_root,
+ ret = insert_extent_backref(trans, fs_info->extent_root,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
if (ret)
@@ -2165,7 +2154,7 @@ out:
}
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
@@ -2182,7 +2171,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
+ trace_run_delayed_data_ref(fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
@@ -2191,17 +2180,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
if (extent_op)
flags |= extent_op->flags_to_set;
- ret = alloc_reserved_file_extent(trans, root,
+ ret = alloc_reserved_file_extent(trans, fs_info,
parent, ref_root, flags,
ref->objectid, ref->offset,
&ins, node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node, parent,
+ ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node, parent,
+ ret = __btrfs_free_extent(trans, fs_info, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
extent_op);
@@ -2230,7 +2219,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
@@ -2246,7 +2235,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (trans->aborted)
return 0;
- if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@@ -2266,8 +2255,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
path->reada = READA_FORWARD;
path->leave_spinning = 1;
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
- path, 0, 1);
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
goto out;
@@ -2302,7 +2290,7 @@ again:
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (item_size < sizeof(*ei)) {
- ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+ ret = convert_extent_item_v0(trans, fs_info->extent_root,
path, (u64)-1, 0);
if (ret < 0) {
err = ret;
@@ -2323,7 +2311,7 @@ out:
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
@@ -2333,11 +2321,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
- bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
- SKINNY_METADATA);
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
+ trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
@@ -2353,7 +2340,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (node->ref_mod != 1) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
@@ -2361,18 +2348,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
- ret = alloc_reserved_tree_block(trans, root,
+ ret = alloc_reserved_tree_block(trans, fs_info,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
ref->level, &ins);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node,
+ ret = __btrfs_inc_extent_ref(trans, fs_info, node,
parent, ref_root,
ref->level, 0, 1,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node,
+ ret = __btrfs_free_extent(trans, fs_info, node,
parent, ref_root,
ref->level, 0, 1, extent_op);
} else {
@@ -2383,7 +2370,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
@@ -2392,7 +2379,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
if (trans->aborted) {
if (insert_reserved)
- btrfs_pin_extent(root, node->bytenr,
+ btrfs_pin_extent(fs_info, node->bytenr,
node->num_bytes, 1);
return 0;
}
@@ -2407,33 +2394,31 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
*/
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
- trace_run_delayed_ref_head(root->fs_info, node, head,
- node->action);
+ trace_run_delayed_ref_head(fs_info, node, head, node->action);
if (insert_reserved) {
- btrfs_pin_extent(root, node->bytenr,
+ btrfs_pin_extent(fs_info, node->bytenr,
node->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, root,
+ ret = btrfs_del_csums(trans, fs_info,
node->bytenr,
node->num_bytes);
}
}
/* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(root->fs_info,
- head->qgroup_ref_root,
+ btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
head->qgroup_reserved);
return ret;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = run_delayed_tree_ref(trans, root, node, extent_op,
+ ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
- ret = run_delayed_data_ref(trans, root, node, extent_op,
+ ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
insert_reserved);
else
BUG();
@@ -2454,13 +2439,14 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
* the extent item from the extent tree, when there still are references
* to add, which would fail because they would not find the extent item.
*/
- list_for_each_entry(ref, &head->ref_list, list) {
- if (ref->action == BTRFS_ADD_DELAYED_REF)
- return ref;
- }
+ if (!list_empty(&head->ref_add_list))
+ return list_first_entry(&head->ref_add_list,
+ struct btrfs_delayed_ref_node, add_list);
- return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
- list);
+ ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ ASSERT(list_empty(&ref->add_list));
+ return ref;
}
/*
@@ -2468,14 +2454,13 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
unsigned long nr)
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
- struct btrfs_fs_info *fs_info = root->fs_info;
ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
@@ -2537,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (ref && ref->seq &&
btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
spin_unlock(&locked_ref->lock);
- btrfs_delayed_ref_unlock(locked_ref);
spin_lock(&delayed_refs->lock);
locked_ref->processing = 0;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
cond_resched();
count++;
@@ -2574,7 +2559,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (extent_op) {
spin_unlock(&locked_ref->lock);
- ret = run_delayed_extent_op(trans, root,
+ ret = run_delayed_extent_op(trans, fs_info,
ref, extent_op);
btrfs_free_delayed_extent_op(extent_op);
@@ -2587,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
*/
if (must_insert_reserved)
locked_ref->must_insert_reserved = 1;
+ spin_lock(&delayed_refs->lock);
locked_ref->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
btrfs_debug(fs_info,
"run_delayed_extent_op returned %d",
ret);
@@ -2620,6 +2608,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
actual_count++;
ref->in_tree = 0;
list_del(&ref->list);
+ if (!list_empty(&ref->add_list))
+ list_del(&ref->add_list);
}
atomic_dec(&delayed_refs->num_entries);
@@ -2642,7 +2632,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
spin_unlock(&locked_ref->lock);
- ret = run_one_delayed_ref(trans, root, ref, extent_op,
+ ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
must_insert_reserved);
btrfs_free_delayed_extent_op(extent_op);
@@ -2743,43 +2733,43 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
{
u64 num_bytes;
num_bytes = heads * (sizeof(struct btrfs_extent_item) +
sizeof(struct btrfs_extent_inline_ref));
- if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
num_bytes += heads * sizeof(struct btrfs_tree_block_info);
/*
* We don't ever fill up leaves all the way so multiply by 2 just to be
* closer to what we're really going to want to use.
*/
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
}
/*
* Takes the number of bytes to be csumm'ed and figures out how many leaves it
* would require to store the csums for that many bytes.
*/
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
u64 csum_size;
u64 num_csums_per_leaf;
u64 num_csums;
- csum_size = BTRFS_MAX_ITEM_SIZE(root);
+ csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
num_csums_per_leaf = div64_u64(csum_size,
- (u64)btrfs_super_csum_size(root->fs_info->super_copy));
- num_csums = div64_u64(csum_bytes, root->sectorsize);
+ (u64)btrfs_super_csum_size(fs_info->super_copy));
+ num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
num_csums += num_csums_per_leaf - 1;
num_csums = div64_u64(num_csums, num_csums_per_leaf);
return num_csums;
}
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
@@ -2788,15 +2778,16 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
- num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- num_heads = heads_to_leaves(root, num_heads);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+ num_heads = heads_to_leaves(fs_info, num_heads);
if (num_heads > 1)
- num_bytes += (num_heads - 1) * root->nodesize;
+ num_bytes += (num_heads - 1) * fs_info->nodesize;
num_bytes <<= 1;
- num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+ num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
+ fs_info->nodesize;
+ num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
num_dirty_bgs);
- global_rsv = &root->fs_info->global_block_rsv;
+ global_rsv = &fs_info->global_block_rsv;
/*
* If we can't allocate any more chunks lets make sure we have _lots_ of
@@ -2815,9 +2806,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
u64 avg_runtime;
@@ -2826,12 +2816,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
smp_mb();
avg_runtime = fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
- if (num_entries * avg_runtime >= NSEC_PER_SEC)
+ if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans, root);
+ return btrfs_check_space_for_delayed_refs(trans, fs_info);
}
struct async_delayed_refs {
@@ -2844,16 +2834,21 @@ struct async_delayed_refs {
struct btrfs_work work;
};
+static inline struct async_delayed_refs *
+to_async_delayed_refs(struct btrfs_work *work)
+{
+ return container_of(work, struct async_delayed_refs, work);
+}
+
static void delayed_ref_async_start(struct btrfs_work *work)
{
- struct async_delayed_refs *async;
+ struct async_delayed_refs *async = to_async_delayed_refs(work);
struct btrfs_trans_handle *trans;
+ struct btrfs_fs_info *fs_info = async->root->fs_info;
int ret;
- async = container_of(work, struct async_delayed_refs, work);
-
/* if the commit is already started, we don't need to wait here */
- if (btrfs_transaction_blocked(async->root->fs_info))
+ if (btrfs_transaction_blocked(fs_info))
goto done;
trans = btrfs_join_transaction(async->root);
@@ -2872,11 +2867,11 @@ static void delayed_ref_async_start(struct btrfs_work *work)
if (trans->transid > async->transid)
goto end;
- ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
if (ret)
async->error = ret;
end:
- ret = btrfs_end_transaction(trans, async->root);
+ ret = btrfs_end_transaction(trans);
if (ret && !async->error)
async->error = ret;
done:
@@ -2886,7 +2881,7 @@ done:
kfree(async);
}
-int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait)
{
struct async_delayed_refs *async;
@@ -2896,7 +2891,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
if (!async)
return -ENOMEM;
- async->root = root->fs_info->tree_root;
+ async->root = fs_info->tree_root;
async->count = count;
async->error = 0;
async->transid = transid;
@@ -2909,7 +2904,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
btrfs_init_work(&async->work, btrfs_extent_refs_helper,
delayed_ref_async_start, NULL, NULL);
- btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+ btrfs_queue_work(fs_info->extent_workers, &async->work);
if (wait) {
wait_for_completion(&async->wait);
@@ -2931,7 +2926,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long count)
+ struct btrfs_fs_info *fs_info, unsigned long count)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -2944,12 +2939,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (trans->aborted)
return 0;
- if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &root->fs_info->flags))
+ if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
return 0;
- if (root == root->fs_info->extent_root)
- root = root->fs_info->tree_root;
-
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2959,7 +2951,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
trans->can_flush_pending_bgs = false;
- ret = __btrfs_run_delayed_refs(trans, root, count);
+ ret = __btrfs_run_delayed_refs(trans, fs_info, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -2967,7 +2959,7 @@ again:
if (run_all) {
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
+ btrfs_create_pending_block_groups(trans, fs_info);
spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->href_root);
@@ -3012,7 +3004,7 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 flags,
int level, int is_data)
{
@@ -3029,7 +3021,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
num_bytes, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
@@ -3103,7 +3095,8 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 objectid, u64 offset, u64 bytenr)
{
- struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref;
struct btrfs_extent_inline_ref *iref;
@@ -3210,6 +3203,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
int full_backref, int inc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 num_bytes;
u64 parent;
@@ -3220,11 +3214,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int i;
int level;
int ret = 0;
- int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+ int (*process_func)(struct btrfs_trans_handle *,
+ struct btrfs_fs_info *,
u64, u64, u64, u64, u64, u64);
- if (btrfs_is_testing(root->fs_info))
+ if (btrfs_is_testing(fs_info))
return 0;
ref_root = btrfs_header_owner(buf);
@@ -3260,15 +3255,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
- ret = process_func(trans, root, bytenr, num_bytes,
+ ret = process_func(trans, fs_info, bytenr, num_bytes,
parent, ref_root, key.objectid,
key.offset);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
- num_bytes = root->nodesize;
- ret = process_func(trans, root, bytenr, num_bytes,
+ num_bytes = fs_info->nodesize;
+ ret = process_func(trans, fs_info, bytenr, num_bytes,
parent, ref_root, level - 1, 0);
if (ret)
goto fail;
@@ -3292,12 +3287,12 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_block_group_cache *cache)
{
int ret;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_root *extent_root = fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
@@ -3319,22 +3314,20 @@ fail:
}
static struct btrfs_block_group_cache *
-next_block_group(struct btrfs_root *root,
+next_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *cache)
{
struct rb_node *node;
- spin_lock(&root->fs_info->block_group_cache_lock);
+ spin_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->key.objectid + cache->key.offset;
- spin_unlock(&root->fs_info->block_group_cache_lock);
+ spin_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(root->fs_info,
- next_bytenr);
- return cache;
+ cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -3344,7 +3337,7 @@ next_block_group(struct btrfs_root *root,
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&root->fs_info->block_group_cache_lock);
+ spin_unlock(&fs_info->block_group_cache_lock);
return cache;
}
@@ -3352,7 +3345,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
- struct btrfs_root *root = block_group->fs_info->tree_root;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
@@ -3425,8 +3419,8 @@ again:
WARN_ON(ret);
if (i_size_read(inode) > 0) {
- ret = btrfs_check_trunc_cache_free_space(root,
- &root->fs_info->global_block_rsv);
+ ret = btrfs_check_trunc_cache_free_space(fs_info,
+ &fs_info->global_block_rsv);
if (ret)
goto out_put;
@@ -3437,7 +3431,7 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
+ !btrfs_test_opt(fs_info, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3506,14 +3500,14 @@ out:
}
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_path *path;
if (list_empty(&cur_trans->dirty_bgs) ||
- !btrfs_test_opt(root->fs_info, SPACE_CACHE))
+ !btrfs_test_opt(fs_info, SPACE_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -3544,7 +3538,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
* we're still allowing others to join the commit.
*/
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
@@ -3569,7 +3563,7 @@ again:
* make sure all the block groups on our dirty list actually
* exist
*/
- btrfs_create_pending_block_groups(trans, root);
+ btrfs_create_pending_block_groups(trans, fs_info);
if (!path) {
path = btrfs_alloc_path();
@@ -3594,9 +3588,7 @@ again:
*/
if (!list_empty(&cache->io_list)) {
list_del_init(&cache->io_list);
- btrfs_wait_cache_io(root, trans, cache,
- &cache->io_ctl, path,
- cache->key.objectid);
+ btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
}
@@ -3619,7 +3611,8 @@ again:
if (cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
- ret = btrfs_write_out_cache(root, trans, cache, path);
+ ret = btrfs_write_out_cache(fs_info, trans,
+ cache, path);
if (ret == 0 && cache->io_ctl.inode) {
num_started++;
should_put = 0;
@@ -3638,7 +3631,8 @@ again:
}
}
if (!ret) {
- ret = write_one_cache_group(trans, root, path, cache);
+ ret = write_one_cache_group(trans, fs_info,
+ path, cache);
/*
* Our block group might still be attached to the list
* of new block groups in the transaction handle of some
@@ -3683,7 +3677,7 @@ again:
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, root, 0);
+ ret = btrfs_run_delayed_refs(trans, fs_info, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3698,7 +3692,7 @@ again:
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret < 0) {
- btrfs_cleanup_dirty_bgs(cur_trans, root);
+ btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
btrfs_free_path(path);
@@ -3706,7 +3700,7 @@ again:
}
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
@@ -3749,9 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
- btrfs_wait_cache_io(root, trans, cache,
- &cache->io_ctl, path,
- cache->key.objectid);
+ btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
spin_lock(&cur_trans->dirty_bgs_lock);
}
@@ -3767,11 +3759,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, fs_info,
+ (unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
- ret = btrfs_write_out_cache(root, trans, cache, path);
+ ret = btrfs_write_out_cache(fs_info, trans,
+ cache, path);
if (ret == 0 && cache->io_ctl.inode) {
num_started++;
should_put = 0;
@@ -3785,7 +3779,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
}
if (!ret) {
- ret = write_one_cache_group(trans, root, path, cache);
+ ret = write_one_cache_group(trans, fs_info,
+ path, cache);
/*
* One of the free space endio workers might have
* created a new block group while updating a free space
@@ -3802,8 +3797,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
if (ret == -ENOENT) {
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- ret = write_one_cache_group(trans, root, path,
- cache);
+ ret = write_one_cache_group(trans, fs_info,
+ path, cache);
}
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -3820,8 +3815,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
list_del_init(&cache->io_list);
- btrfs_wait_cache_io(root, trans, cache,
- &cache->io_ctl, path, cache->key.objectid);
+ btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
}
@@ -3829,12 +3823,12 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group_cache *block_group;
int readonly = 0;
- block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
if (!block_group || block_group->ro)
readonly = 1;
if (block_group)
@@ -4043,9 +4037,9 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
* progress (either running or paused) picks the target profile (if it's
* already available), otherwise falls back to plain reducing.
*/
-static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 num_devices = root->fs_info->fs_devices->rw_devices;
+ u64 num_devices = fs_info->fs_devices->rw_devices;
u64 target;
u64 raid_type;
u64 allowed = 0;
@@ -4054,16 +4048,16 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
* see if restripe for this chunk_type is in progress, if so
* try to reduce to the target profile
*/
- spin_lock(&root->fs_info->balance_lock);
- target = get_restripe_target(root->fs_info, flags);
+ spin_lock(&fs_info->balance_lock);
+ target = get_restripe_target(fs_info, flags);
if (target) {
/* pick target profile only if it's already available */
if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&root->fs_info->balance_lock);
+ spin_unlock(&fs_info->balance_lock);
return extended_to_chunk(target);
}
}
- spin_unlock(&root->fs_info->balance_lock);
+ spin_unlock(&fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
@@ -4088,39 +4082,40 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return extended_to_chunk(flags | allowed);
}
-static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
+static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
do {
flags = orig_flags;
- seq = read_seqbegin(&root->fs_info->profiles_lock);
+ seq = read_seqbegin(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits;
+ flags |= fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits;
+ flags |= fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits;
- } while (read_seqretry(&root->fs_info->profiles_lock, seq));
+ flags |= fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&fs_info->profiles_lock, seq));
- return btrfs_reduce_alloc_profile(root, flags);
+ return btrfs_reduce_alloc_profile(fs_info, flags);
}
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 flags;
u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
- else if (root == root->fs_info->chunk_root)
+ else if (root == fs_info->chunk_root)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = get_alloc_profile(root, flags);
+ ret = get_alloc_profile(fs_info, flags);
return ret;
}
@@ -4135,7 +4130,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
int have_pinned_space;
/* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ bytes = ALIGN(bytes, fs_info->sectorsize);
if (btrfs_is_free_space_inode(inode)) {
need_commit = 0;
@@ -4181,10 +4176,9 @@ alloc:
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- alloc_target,
+ ret = do_chunk_alloc(trans, fs_info, alloc_target,
CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret < 0) {
if (ret != -ENOSPC)
return ret;
@@ -4213,12 +4207,13 @@ alloc:
/* commit the current transaction and try again */
commit_trans:
if (need_commit &&
- !atomic_read(&root->fs_info->open_ioctl_trans)) {
+ !atomic_read(&fs_info->open_ioctl_trans)) {
need_commit--;
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, 0, -1);
- btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, -1, 0,
+ (u64)-1);
}
trans = btrfs_join_transaction(root);
@@ -4228,7 +4223,7 @@ commit_trans:
test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
&trans->transaction->flags) ||
need_commit > 0) {
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
/*
@@ -4236,21 +4231,21 @@ commit_trans:
* operations. Wait for it to finish so that
* more space is released.
*/
- mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
- mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+ mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
goto again;
} else {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
}
}
- trace_btrfs_space_reservation(root->fs_info,
+ trace_btrfs_space_reservation(fs_info,
"space_info:enospc",
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
- trace_btrfs_space_reservation(root->fs_info, "space_info",
+ trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
@@ -4264,13 +4259,13 @@ commit_trans:
*/
int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
/* align the range */
- len = round_up(start + len, root->sectorsize) -
- round_down(start, root->sectorsize);
- start = round_down(start, root->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
@@ -4294,21 +4289,21 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
/* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->sectorsize) -
- round_down(start, root->sectorsize);
- start = round_down(start, root->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
- data_sinfo = root->fs_info->data_sinfo;
+ data_sinfo = fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
if (WARN_ON(data_sinfo->bytes_may_use < len))
data_sinfo->bytes_may_use = 0;
else
data_sinfo->bytes_may_use -= len;
- trace_btrfs_space_reservation(root->fs_info, "space_info",
+ trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
@@ -4322,6 +4317,13 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
*/
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->fs_info->sectorsize) -
+ round_down(start, root->fs_info->sectorsize);
+ start = round_down(start, root->fs_info->sectorsize);
+
btrfs_free_reserved_data_space_noquota(inode, start, len);
btrfs_qgroup_free_data(inode, start, len);
}
@@ -4344,10 +4346,10 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
return (global->size << 1);
}
-static int should_alloc_chunk(struct btrfs_root *root,
+static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
u64 thresh;
@@ -4368,7 +4370,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
* about 1% of the FS size.
*/
if (force == CHUNK_ALLOC_LIMITED) {
- thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+ thresh = btrfs_super_total_bytes(fs_info->super_copy);
thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
@@ -4380,7 +4382,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
@@ -4388,7 +4390,7 @@ static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6))
- num_dev = root->fs_info->fs_devices->rw_devices;
+ num_dev = fs_info->fs_devices->rw_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_dev = 2;
else
@@ -4403,8 +4405,7 @@ static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
* removing a chunk.
*/
void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 type)
+ struct btrfs_fs_info *fs_info, u64 type)
{
struct btrfs_space_info *info;
u64 left;
@@ -4416,43 +4417,43 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
- ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
+ ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
- info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
info->bytes_reserved - info->bytes_readonly -
info->bytes_may_use;
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(root, type);
+ num_devs = get_profile_num_devs(fs_info, type);
/* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
- btrfs_calc_trans_metadata_size(root, 1);
+ thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_trans_metadata_size(fs_info, 1);
- if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
- btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
- dump_space_info(root->fs_info, info, 0, 0);
+ if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
+ left, thresh, type);
+ dump_space_info(fs_info, info, 0, 0);
}
if (left < thresh) {
u64 flags;
- flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+ flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- ret = btrfs_alloc_chunk(trans, root, flags);
+ ret = btrfs_alloc_chunk(trans, fs_info, flags);
}
if (!ret) {
- ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
- &root->fs_info->chunk_block_rsv,
+ ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ &fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
trans->chunk_bytes_reserved += thresh;
@@ -4469,10 +4470,9 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* - return errors including -ENOSPC otherwise.
*/
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 flags, int force)
+ struct btrfs_fs_info *fs_info, u64 flags, int force)
{
struct btrfs_space_info *space_info;
- struct btrfs_fs_info *fs_info = extent_root->fs_info;
int wait_for_alloc = 0;
int ret = 0;
@@ -4480,10 +4480,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
if (trans->allocating_chunk)
return -ENOSPC;
- space_info = __find_space_info(extent_root->fs_info, flags);
+ space_info = __find_space_info(fs_info, flags);
if (!space_info) {
- ret = update_space_info(extent_root->fs_info, flags,
- 0, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
BUG_ON(ret); /* -ENOMEM */
}
BUG_ON(!space_info); /* Logic error */
@@ -4493,7 +4492,7 @@ again:
if (force < space_info->force_alloc)
force = space_info->force_alloc;
if (space_info->full) {
- if (should_alloc_chunk(extent_root, space_info, force))
+ if (should_alloc_chunk(fs_info, space_info, force))
ret = -ENOSPC;
else
ret = 0;
@@ -4501,7 +4500,7 @@ again:
return ret;
}
- if (!should_alloc_chunk(extent_root, space_info, force)) {
+ if (!should_alloc_chunk(fs_info, space_info, force)) {
spin_unlock(&space_info->lock);
return 0;
} else if (space_info->chunk_alloc) {
@@ -4551,9 +4550,9 @@ again:
* Check if we have enough space in SYSTEM chunk because we may need
* to update devices.
*/
- check_system_chunk(trans, extent_root, flags);
+ check_system_chunk(trans, fs_info, flags);
- ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ ret = btrfs_alloc_chunk(trans, fs_info, flags);
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
@@ -4585,7 +4584,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, extent_root);
+ btrfs_create_pending_block_groups(trans, fs_info);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4595,7 +4594,8 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_block_rsv *global_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 profile;
u64 space_size;
u64 avail;
@@ -4605,8 +4605,6 @@ static int can_overcommit(struct btrfs_root *root,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
- BUG_ON(root->fs_info == NULL);
- global_rsv = &root->fs_info->global_block_rsv;
profile = btrfs_get_alloc_profile(root, 0);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly;
@@ -4625,9 +4623,9 @@ static int can_overcommit(struct btrfs_root *root,
used += space_info->bytes_may_use;
- spin_lock(&root->fs_info->free_chunk_lock);
- avail = root->fs_info->free_chunk_space;
- spin_unlock(&root->fs_info->free_chunk_lock);
+ spin_lock(&fs_info->free_chunk_lock);
+ avail = fs_info->free_chunk_space;
+ spin_unlock(&fs_info->free_chunk_lock);
/*
* If we have dup, raid1 or raid10 then only half of the free
@@ -4655,10 +4653,10 @@ static int can_overcommit(struct btrfs_root *root,
return 0;
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
unsigned long nr_pages, int nr_items)
{
- struct super_block *sb = root->fs_info->sb;
+ struct super_block *sb = fs_info->sb;
if (down_read_trylock(&sb->s_umount)) {
writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
@@ -4671,19 +4669,19 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
+ btrfs_start_delalloc_roots(fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, nr_items,
- 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
}
}
-static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+ u64 to_reclaim)
{
u64 bytes;
int nr;
- bytes = btrfs_calc_trans_metadata_size(root, 1);
+ bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
nr = (int)div64_u64(to_reclaim, bytes);
if (!nr)
nr = 1;
@@ -4698,6 +4696,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
bool wait_ordered)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -4710,21 +4709,20 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(root, to_reclaim);
+ items = calc_reclaim_items_nr(fs_info, to_reclaim);
to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
trans = (struct btrfs_trans_handle *)current->journal_info;
- block_rsv = &root->fs_info->delalloc_block_rsv;
+ block_rsv = &fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
delalloc_bytes = percpu_counter_sum_positive(
- &root->fs_info->delalloc_bytes);
+ &fs_info->delalloc_bytes);
if (delalloc_bytes == 0) {
if (trans)
return;
if (wait_ordered)
- btrfs_wait_ordered_roots(root->fs_info, items,
- 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
return;
}
@@ -4732,12 +4730,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_SHIFT;
- btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
+ btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
* we do anything.
*/
- max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+ max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
if (!max_reclaim)
goto skip_async;
@@ -4746,8 +4744,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
else
max_reclaim -= nr_pages;
- wait_event(root->fs_info->async_submit_wait,
- atomic_read(&root->fs_info->async_delalloc_pages) <=
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
skip_async:
if (!trans)
@@ -4768,15 +4766,14 @@ skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(root->fs_info, items,
- 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
break;
}
delalloc_bytes = percpu_counter_sum_positive(
- &root->fs_info->delalloc_bytes);
+ &fs_info->delalloc_bytes);
}
}
@@ -4794,7 +4791,8 @@ static int may_commit_transaction(struct btrfs_root *root,
struct btrfs_space_info *space_info,
u64 bytes, int force)
{
- struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
struct btrfs_trans_handle *trans;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -4829,7 +4827,7 @@ commit:
if (IS_ERR(trans))
return -ENOSPC;
- return btrfs_commit_transaction(trans, root);
+ return btrfs_commit_transaction(trans);
}
struct reserve_ticket {
@@ -4843,6 +4841,7 @@ static int flush_space(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 num_bytes,
u64 orig_bytes, int state)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -4851,7 +4850,7 @@ static int flush_space(struct btrfs_root *root,
case FLUSH_DELAYED_ITEMS_NR:
case FLUSH_DELAYED_ITEMS:
if (state == FLUSH_DELAYED_ITEMS_NR)
- nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+ nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
else
nr = -1;
@@ -4860,8 +4859,8 @@ static int flush_space(struct btrfs_root *root,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_run_delayed_items_nr(trans, root, nr);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+ btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
@@ -4874,10 +4873,10 @@ static int flush_space(struct btrfs_root *root,
ret = PTR_ERR(trans);
break;
}
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ ret = do_chunk_alloc(trans, fs_info,
btrfs_get_alloc_profile(root, 0),
CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -4889,7 +4888,7 @@ static int flush_space(struct btrfs_root *root,
break;
}
- trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
+ trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes,
orig_bytes, state, ret);
return ret;
}
@@ -4935,6 +4934,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
struct btrfs_root *root, u64 used)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
/* If we're just plain full then async reclaim just slows us down. */
@@ -4944,9 +4944,8 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
if (!btrfs_calc_reclaim_metadata_size(root, space_info))
return 0;
- return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING,
- &root->fs_info->fs_state));
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
static void wake_all_tickets(struct list_head *head)
@@ -5126,6 +5125,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -5146,15 +5146,13 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
*/
if (used + orig_bytes <= space_info->total_bytes) {
space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info, "space_info",
- space_info->flags, orig_bytes,
- 1);
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, orig_bytes, 1);
ret = 0;
} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info, "space_info",
- space_info->flags, orig_bytes,
- 1);
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, orig_bytes, 1);
ret = 0;
}
@@ -5173,7 +5171,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
- trace_btrfs_trigger_flush(root->fs_info,
+ trace_btrfs_trigger_flush(fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
@@ -5191,15 +5189,13 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
* which means we won't have fs_info->fs_root set, so don't do
* the async reclaim as we will panic.
*/
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags) &&
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
need_do_async_reclaim(space_info, root, used) &&
- !work_busy(&root->fs_info->async_reclaim_work)) {
- trace_btrfs_trigger_flush(root->fs_info,
- space_info->flags,
- orig_bytes, flush,
- "preempt");
+ !work_busy(&fs_info->async_reclaim_work)) {
+ trace_btrfs_trigger_flush(fs_info, space_info->flags,
+ orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
- &root->fs_info->async_reclaim_work);
+ &fs_info->async_reclaim_work);
}
}
spin_unlock(&space_info->lock);
@@ -5207,19 +5203,19 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+ return wait_reserve_ticket(fs_info, space_info, &ticket,
orig_bytes);
ret = 0;
- priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+ priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
if (ticket.bytes < orig_bytes) {
u64 num_bytes = orig_bytes - ticket.bytes;
space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags,
- num_bytes, 0);
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ num_bytes, 0);
}
list_del_init(&ticket.list);
@@ -5249,22 +5245,20 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- struct btrfs_block_rsv *global_rsv =
- &root->fs_info->global_block_rsv;
-
if (block_rsv != global_rsv &&
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
if (ret == -ENOSPC)
- trace_btrfs_space_reservation(root->fs_info,
- "space_info:enospc",
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
return ret;
@@ -5274,18 +5268,19 @@ static struct btrfs_block_rsv *get_block_rsv(
const struct btrfs_trans_handle *trans,
const struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = NULL;
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- (root == root->fs_info->csum_root && trans->adding_csums) ||
- (root == root->fs_info->uuid_root))
+ (root == fs_info->csum_root && trans->adding_csums) ||
+ (root == fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
block_rsv = root->block_rsv;
if (!block_rsv)
- block_rsv = &root->fs_info->empty_block_rsv;
+ block_rsv = &fs_info->empty_block_rsv;
return block_rsv;
}
@@ -5507,11 +5502,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
rsv->type = type;
}
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
if (!block_rsv)
@@ -5523,12 +5517,12 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
return block_rsv;
}
-void btrfs_free_block_rsv(struct btrfs_root *root,
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
if (!rsv)
return;
- btrfs_block_rsv_release(root, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
kfree(rsv);
}
@@ -5555,8 +5549,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_check(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -5603,16 +5596,16 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-void btrfs_block_rsv_release(struct btrfs_root *root,
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
{
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
if (global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
- num_bytes);
+ block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
}
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5707,7 +5700,7 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
}
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
if (!trans->block_rsv)
return;
@@ -5715,9 +5708,10 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
- trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ trans->bytes_reserved);
trans->bytes_reserved = 0;
}
@@ -5743,6 +5737,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
/*
* We always use trans->block_rsv here as we will have reserved space
@@ -5758,19 +5753,22 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
* added it, so this takes the reservation so we can release it later
* when we are truly done with the orphan item.
*/
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- trace_btrfs_space_reservation(root->fs_info, "orphan",
+ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+ trace_btrfs_space_reservation(fs_info, "orphan",
btrfs_ino(inode), num_bytes, 1);
return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
void btrfs_orphan_release_metadata(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- trace_btrfs_space_reservation(root->fs_info, "orphan",
+ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+ trace_btrfs_space_reservation(fs_info, "orphan",
btrfs_ino(inode), num_bytes, 0);
- btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+ btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
}
/*
@@ -5795,11 +5793,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
{
u64 num_bytes;
int ret;
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * root->nodesize;
+ num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
@@ -5809,8 +5808,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
*qgroup_reserved = num_bytes;
- num_bytes = btrfs_calc_trans_metadata_size(root, items);
- rsv->space_info = __find_space_info(root->fs_info,
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
+ rsv->space_info = __find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
ret = btrfs_block_rsv_add(root, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
@@ -5824,11 +5823,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
u64 qgroup_reserved)
{
- btrfs_block_rsv_release(root, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}
/**
@@ -5894,35 +5893,38 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
int reserve)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 old_csums, num_csums;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
BTRFS_I(inode)->csum_bytes == 0)
return 0;
- old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+ old_csums = btrfs_csum_bytes_to_leaves(fs_info,
+ BTRFS_I(inode)->csum_bytes);
if (reserve)
BTRFS_I(inode)->csum_bytes += num_bytes;
else
BTRFS_I(inode)->csum_bytes -= num_bytes;
- num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+ num_csums = btrfs_csum_bytes_to_leaves(fs_info,
+ BTRFS_I(inode)->csum_bytes);
/* No change, no need to reserve more */
if (old_csums == num_csums)
return 0;
if (reserve)
- return btrfs_calc_trans_metadata_size(root,
+ return btrfs_calc_trans_metadata_size(fs_info,
num_csums - old_csums);
- return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
+ return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
}
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
u64 to_reserve = 0;
u64 csum_bytes;
unsigned nr_extents = 0;
@@ -5949,13 +5951,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
if (flush != BTRFS_RESERVE_NO_FLUSH &&
- btrfs_transaction_in_commit(root->fs_info))
+ btrfs_transaction_in_commit(fs_info))
schedule_timeout(1);
if (delalloc_lock)
mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
- num_bytes = ALIGN(num_bytes, root->sectorsize);
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
nr_extents = (unsigned)div64_u64(num_bytes +
@@ -5970,28 +5972,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
BTRFS_I(inode)->reserved_extents;
/* We always want to reserve a slot for updating the inode. */
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
+ to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
+ if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
ret = btrfs_qgroup_reserve_meta(root,
- nr_extents * root->nodesize);
+ nr_extents * fs_info->nodesize);
if (ret)
goto out_fail;
}
ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root,
+ nr_extents * fs_info->nodesize);
goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
&BTRFS_I(inode)->runtime_flags)) {
- to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
+ to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
release_extra = true;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -6001,12 +6004,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), to_reserve, 1);
if (release_extra)
- btrfs_block_rsv_release(root, block_rsv,
- btrfs_calc_trans_metadata_size(root,
- 1));
+ btrfs_block_rsv_release(fs_info, block_rsv,
+ btrfs_calc_trans_metadata_size(fs_info, 1));
return 0;
out_fail:
@@ -6061,11 +6063,11 @@ out_fail:
}
spin_unlock(&BTRFS_I(inode)->lock);
if (dropped)
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
if (to_free) {
- btrfs_block_rsv_release(root, block_rsv, to_free);
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_block_rsv_release(fs_info, block_rsv, to_free);
+ trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
}
if (delalloc_lock)
@@ -6084,11 +6086,11 @@ out_fail:
*/
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 to_free = 0;
unsigned dropped;
- num_bytes = ALIGN(num_bytes, root->sectorsize);
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode, num_bytes);
@@ -6096,16 +6098,15 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
to_free = calc_csum_metadata_size(inode, num_bytes, 0);
spin_unlock(&BTRFS_I(inode)->lock);
if (dropped > 0)
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
- if (btrfs_is_testing(root->fs_info))
+ if (btrfs_is_testing(fs_info))
return;
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
- btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
- to_free);
+ btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
}
/**
@@ -6166,11 +6167,10 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
}
static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_fs_info *info, u64 bytenr,
u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
- struct btrfs_fs_info *info = root->fs_info;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -6211,7 +6211,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -6236,7 +6236,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- trace_btrfs_space_reservation(root->fs_info, "pinned",
+ trace_btrfs_space_reservation(info, "pinned",
cache->space_info->flags,
num_bytes, 1);
set_extent_dirty(info->pinned_extents,
@@ -6276,19 +6276,19 @@ static int update_block_group(struct btrfs_trans_handle *trans,
return 0;
}
-static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
struct btrfs_block_group_cache *cache;
u64 bytenr;
- spin_lock(&root->fs_info->block_group_cache_lock);
- bytenr = root->fs_info->first_logical_byte;
- spin_unlock(&root->fs_info->block_group_cache_lock);
+ spin_lock(&fs_info->block_group_cache_lock);
+ bytenr = fs_info->first_logical_byte;
+ spin_unlock(&fs_info->block_group_cache_lock);
if (bytenr < (u64)-1)
return bytenr;
- cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+ cache = btrfs_lookup_first_block_group(fs_info, search_start);
if (!cache)
return 0;
@@ -6298,7 +6298,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-static int pin_down_extent(struct btrfs_root *root,
+static int pin_down_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
@@ -6313,9 +6313,9 @@ static int pin_down_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- trace_btrfs_space_reservation(root->fs_info, "pinned",
+ trace_btrfs_space_reservation(fs_info, "pinned",
cache->space_info->flags, num_bytes, 1);
- set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ set_extent_dirty(fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
@@ -6323,15 +6323,15 @@ static int pin_down_extent(struct btrfs_root *root,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent(struct btrfs_root *root,
+int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group_cache *cache;
- cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+ pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -6340,13 +6340,13 @@ int btrfs_pin_extent(struct btrfs_root *root,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group_cache *cache;
int ret;
- cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -6358,7 +6358,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
*/
cache_block_group(cache, 1);
- pin_down_extent(root, cache, bytenr, num_bytes, 0);
+ pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6366,13 +6366,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
return ret;
}
-static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 num_bytes)
{
int ret;
struct btrfs_block_group_cache *block_group;
struct btrfs_caching_control *caching_ctl;
- block_group = btrfs_lookup_block_group(root->fs_info, start);
+ block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
@@ -6387,7 +6388,7 @@ static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_b
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
- ret = add_excluded_extent(root, start, num_bytes);
+ ret = add_excluded_extent(fs_info, start, num_bytes);
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
@@ -6401,7 +6402,7 @@ static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_b
num_bytes = (start + num_bytes) -
caching_ctl->progress;
start = caching_ctl->progress;
- ret = add_excluded_extent(root, start, num_bytes);
+ ret = add_excluded_extent(fs_info, start, num_bytes);
}
out_lock:
mutex_unlock(&caching_ctl->mutex);
@@ -6411,7 +6412,7 @@ out_lock:
return ret;
}
-int btrfs_exclude_logged_extents(struct btrfs_root *log,
+int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
struct btrfs_file_extent_item *item;
@@ -6419,7 +6420,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
int found_type;
int i;
- if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
return 0;
for (i = 0; i < btrfs_header_nritems(eb); i++) {
@@ -6434,7 +6435,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
continue;
key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
- __exclude_logged_extent(log, key.objectid, key.offset);
+ __exclude_logged_extent(fs_info, key.objectid, key.offset);
}
return 0;
@@ -6499,16 +6500,9 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
*
- * This is called by the allocator when it reserves space. Metadata
- * reservations should be called with RESERVE_ALLOC so we do the proper
- * ENOSPC accounting. For data we handle the reservation through clearing the
- * delalloc bits in the io_tree. We have to do this since we could end up
- * allocating less disk space for the amount of data we have reserved in the
- * case of compression.
- *
- * If this is a reservation and the block group has become read only we cannot
- * make the reservation and return -EAGAIN, otherwise this function always
- * succeeds.
+ * This is called by the allocator when it reserves space. If this is a
+ * reservation and the block group has become read only we cannot make the
+ * reservation and return -EAGAIN, otherwise this function always succeeds.
*/
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 ram_bytes, u64 num_bytes, int delalloc)
@@ -6568,9 +6562,8 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
return ret;
}
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
@@ -6604,11 +6597,11 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
* what it should be based on the mount options.
*/
static struct btrfs_free_cluster *
-fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
- u64 *empty_cluster)
+fetch_cluster_info(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 *empty_cluster)
{
struct btrfs_free_cluster *ret = NULL;
- bool ssd = btrfs_test_opt(root->fs_info, SSD);
+ bool ssd = btrfs_test_opt(fs_info, SSD);
*empty_cluster = 0;
if (btrfs_mixed_space_info(space_info))
@@ -6617,20 +6610,20 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
if (ssd)
*empty_cluster = SZ_2M;
if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- ret = &root->fs_info->meta_alloc_cluster;
+ ret = &fs_info->meta_alloc_cluster;
if (!ssd)
*empty_cluster = SZ_64K;
} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
- ret = &root->fs_info->data_alloc_cluster;
+ ret = &fs_info->data_alloc_cluster;
}
return ret;
}
-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+static int unpin_extent_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 end,
const bool return_free_space)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -6650,7 +6643,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
- cluster = fetch_cluster_info(root,
+ cluster = fetch_cluster_info(fs_info,
cache->space_info,
&empty_cluster);
empty_cluster <<= 1;
@@ -6729,9 +6722,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
@@ -6753,12 +6745,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
break;
}
- if (btrfs_test_opt(root->fs_info, DISCARD))
- ret = btrfs_discard_extent(root, start,
+ if (btrfs_test_opt(fs_info, DISCARD))
+ ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end);
- unpin_extent_range(root, start, end, true);
+ unpin_extent_range(fs_info, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
@@ -6774,7 +6766,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = -EROFS;
if (!trans->aborted)
- ret = btrfs_discard_extent(root,
+ ret = btrfs_discard_extent(fs_info,
block_group->key.objectid,
block_group->key.offset,
&trimmed);
@@ -6816,7 +6808,7 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *info,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
@@ -6824,7 +6816,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_fs_info *info = root->fs_info;
struct btrfs_root *extent_root = info->extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -6839,8 +6830,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
int last_ref = 0;
- bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
- SKINNY_METADATA);
+ bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
path = btrfs_alloc_path();
if (!path)
@@ -6937,8 +6927,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"umm, got %d back from search, was looking for %llu",
ret, bytenr);
if (ret > 0)
- btrfs_print_leaf(extent_root,
- path->nodes[0]);
+ btrfs_print_leaf(info, path->nodes[0]);
}
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -6947,7 +6936,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
extent_slot = path->slots[0];
}
} else if (WARN_ON(ret == -ENOENT)) {
- btrfs_print_leaf(extent_root, path->nodes[0]);
+ btrfs_print_leaf(info, path->nodes[0]);
btrfs_err(info,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
@@ -6984,7 +6973,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_err(info,
"umm, got %d back from search, was looking for %llu",
ret, bytenr);
- btrfs_print_leaf(extent_root, path->nodes[0]);
+ btrfs_print_leaf(info, path->nodes[0]);
}
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -7040,7 +7029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
- add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+ add_pinned_bytes(info, -num_bytes, owner_objectid,
root_objectid);
} else {
if (found_extent) {
@@ -7065,21 +7054,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (is_data) {
- ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+ ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
- ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
- num_bytes);
+ ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+ ret = update_block_group(trans, info, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7099,7 +7087,7 @@ out:
* removes it from the tree.
*/
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr)
+ u64 bytenr)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -7169,15 +7157,17 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int pin = 1;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
- buf->start, buf->len,
- parent, root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans,
+ buf->start, buf->len,
+ parent,
+ root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret); /* -ENOMEM */
}
@@ -7188,15 +7178,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, root, buf->start);
+ ret = check_ref_cleanup(trans, buf->start);
if (!ret)
goto out;
}
- cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+ cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(root, cache, buf->start, buf->len, 1);
+ pin_down_extent(fs_info, cache, buf->start,
+ buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -7206,13 +7197,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
pin = 0;
}
out:
if (pin)
- add_pinned_bytes(root->fs_info, buf->len,
- btrfs_header_level(buf),
+ add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
root->root_key.objectid);
/*
@@ -7223,17 +7213,17 @@ out:
}
/* Can return -ENOMEM */
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset)
{
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_is_testing(fs_info))
return 0;
- add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+ add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
/*
* tree log blocks never actually go into the extent allocation
@@ -7242,7 +7232,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
- btrfs_pin_extent(root, bytenr, num_bytes, 1);
+ btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
@@ -7397,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
spin_unlock(&cluster->refill_lock);
- down_read(&used_bg->data_rwsem);
+ /* We should only have one-level nested. */
+ down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
spin_lock(&cluster->refill_lock);
if (used_bg == cluster->block_group)
@@ -7433,8 +7424,9 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 hint_byte, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = orig_root->fs_info;
int ret = 0;
- struct btrfs_root *root = orig_root->fs_info->extent_root;
+ struct btrfs_root *root = fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
@@ -7450,16 +7442,16 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool orig_have_caching_bg = false;
bool full_search = false;
- WARN_ON(num_bytes < root->sectorsize);
+ WARN_ON(num_bytes < fs_info->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
+ trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
- space_info = __find_space_info(root->fs_info, flags);
+ space_info = __find_space_info(fs_info, flags);
if (!space_info) {
- btrfs_err(root->fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", flags);
return -ENOSPC;
}
@@ -7486,7 +7478,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
spin_unlock(&space_info->lock);
}
- last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
+ last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
@@ -7503,11 +7495,10 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
spin_unlock(&last_ptr->lock);
}
- search_start = max(search_start, first_logical_byte(root, 0));
+ search_start = max(search_start, first_logical_byte(fs_info, 0));
search_start = max(search_start, hint_byte);
if (search_start == hint_byte) {
- block_group = btrfs_lookup_block_group(root->fs_info,
- search_start);
+ block_group = btrfs_lookup_block_group(fs_info, search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -7615,7 +7606,7 @@ have_block_group:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(root,
+ trace_btrfs_reserve_extent_cluster(fs_info,
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
@@ -7671,7 +7662,7 @@ refill_cluster:
block_group->full_stripe_len);
/* allocate a cluster in this block group */
- ret = btrfs_find_space_cluster(root, block_group,
+ ret = btrfs_find_space_cluster(fs_info, block_group,
last_ptr, search_start,
num_bytes,
aligned_cluster);
@@ -7688,7 +7679,7 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
- trace_btrfs_reserve_extent_cluster(root,
+ trace_btrfs_reserve_extent_cluster(fs_info,
block_group, search_start,
num_bytes);
goto checks;
@@ -7760,7 +7751,7 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = ALIGN(offset, root->stripesize);
+ search_start = ALIGN(offset, fs_info->stripesize);
/* move on to the next group */
if (search_start + num_bytes >
@@ -7786,7 +7777,7 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
- trace_btrfs_reserve_extent(orig_root, block_group,
+ trace_btrfs_reserve_extent(fs_info, block_group,
search_start, num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
@@ -7847,7 +7838,7 @@ loop:
goto out;
}
- ret = do_chunk_alloc(trans, root, flags,
+ ret = do_chunk_alloc(trans, fs_info, flags,
CHUNK_ALLOC_FORCE);
/*
@@ -7867,7 +7858,7 @@ loop:
else
ret = 0;
if (!exist)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret)
goto out;
}
@@ -7959,7 +7950,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = btrfs_get_alloc_profile(root, is_data);
again:
- WARN_ON(num_bytes < root->sectorsize);
+ WARN_ON(num_bytes < fs_info->sectorsize);
ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
@@ -7967,7 +7958,8 @@ again:
} else if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
num_bytes = min(num_bytes >> 1, ins->offset);
- num_bytes = round_down(num_bytes, root->sectorsize);
+ num_bytes = round_down(num_bytes,
+ fs_info->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
ram_bytes = num_bytes;
if (num_bytes == min_alloc_size)
@@ -7977,7 +7969,7 @@ again:
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(fs_info, flags);
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"allocation failed flags %llu, wanted %llu",
flags, num_bytes);
if (sinfo)
@@ -7988,54 +7980,53 @@ again:
return ret;
}
-static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len,
int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(root->fs_info, start);
+ cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
- btrfs_err(root->fs_info, "Unable to find block group for %llu",
- start);
+ btrfs_err(fs_info, "Unable to find block group for %llu",
+ start);
return -ENOSPC;
}
if (pin)
- pin_down_extent(root, cache, start, len, 1);
+ pin_down_extent(fs_info, cache, start, len, 1);
else {
- if (btrfs_test_opt(root->fs_info, DISCARD))
- ret = btrfs_discard_extent(root, start, len, NULL);
+ if (btrfs_test_opt(fs_info, DISCARD))
+ ret = btrfs_discard_extent(fs_info, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_free_reserved_bytes(cache, len, delalloc);
- trace_btrfs_reserved_extent_free(root, start, len);
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
}
btrfs_put_block_group(cache);
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_root *root,
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
+ return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
}
-int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1, 0);
+ return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod)
{
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
@@ -8094,24 +8085,23 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
+ trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
return ret;
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins)
{
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_extent_item *extent_item;
struct btrfs_tree_block_info *block_info;
struct btrfs_extent_inline_ref *iref;
@@ -8119,16 +8109,15 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
u64 num_bytes = ins->offset;
- bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
- SKINNY_METADATA);
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
if (!skinny_metadata)
size += sizeof(*block_info);
path = btrfs_alloc_path();
if (!path) {
- btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->nodesize);
+ btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ fs_info->nodesize);
return -ENOMEM;
}
@@ -8137,8 +8126,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ins, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(root, ins->objectid,
- root->nodesize);
+ btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+ fs_info->nodesize);
return ret;
}
@@ -8152,7 +8141,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
- num_bytes = root->nodesize;
+ num_bytes = fs_info->nodesize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -8179,29 +8168,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(trans, root, ins->objectid, root->nodesize,
- 1);
+ ret = update_block_group(trans, fs_info, ins->objectid,
+ fs_info->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
+ trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
+ fs_info->nodesize);
return ret;
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
u64 root_objectid, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
@@ -8215,7 +8205,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
* space cache bits as well
*/
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins)
{
@@ -8227,13 +8217,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
* Mixed block groups will exclude before processing the log so we only
* need to do the exclude dance if this fs isn't mixed.
*/
- if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
- ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ ret = __exclude_logged_extent(fs_info, ins->objectid,
+ ins->offset);
if (ret)
return ret;
}
- block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+ block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
if (!block_group)
return -EINVAL;
@@ -8245,7 +8236,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+ ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
0, owner, offset, ins, 1);
btrfs_put_block_group(block_group);
return ret;
@@ -8255,16 +8246,17 @@ static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(root, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(buf))
return buf;
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
- clean_tree_block(trans, root->fs_info, buf);
+ clean_tree_block(trans, fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
@@ -8296,8 +8288,9 @@ static struct btrfs_block_rsv *
use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
bool global_updated = false;
@@ -8315,11 +8308,11 @@ again:
if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
global_updated = true;
- update_global_block_rsv(root->fs_info);
+ update_global_block_rsv(fs_info);
goto again;
}
- if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8363,18 +8356,18 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
- u32 blocksize = root->nodesize;
- bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
- SKINNY_METADATA);
+ u32 blocksize = fs_info->nodesize;
+ bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (btrfs_is_testing(root->fs_info)) {
+ if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
level);
if (!IS_ERR(buf))
@@ -8421,7 +8414,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->is_data = false;
extent_op->level = level;
- ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans,
ins.objectid, ins.offset,
parent, root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
@@ -8436,9 +8429,9 @@ out_free_delayed:
out_free_buf:
free_extent_buffer(buf);
out_free_reserved:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
- unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+ unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
@@ -8464,6 +8457,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct walk_control *wc,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 refs;
@@ -8481,7 +8475,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
} else {
wc->reada_count = wc->reada_count * 3 / 2;
wc->reada_count = min_t(int, wc->reada_count,
- BTRFS_NODEPTRS_PER_BLOCK(root));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
}
eb = path->nodes[wc->level];
@@ -8503,7 +8497,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* We don't lock the tree block, it's OK to be racy here */
- ret = btrfs_lookup_extent_info(trans, root, bytenr,
+ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
&flags);
/* We don't care about errors in readahead. */
@@ -8532,226 +8526,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(root, bytenr);
+ readahead_tree_block(fs_info, bytenr);
nread++;
}
wc->reada_slot = slot;
}
-static int account_leaf_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *eb)
-{
- int nr = btrfs_header_nritems(eb);
- int i, extent_type, ret;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- u64 bytenr, num_bytes;
-
- /* We can be called directly from walk_up_proc() */
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
- return 0;
-
- for (i = 0; i < nr; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
-
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
-
- fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
- /* filter out non qgroup-accountable extents */
- extent_type = btrfs_file_extent_type(eb, fi);
-
- if (extent_type == BTRFS_FILE_EXTENT_INLINE)
- continue;
-
- bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
- if (!bytenr)
- continue;
-
- num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
- ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
- bytenr, num_bytes, GFP_NOFS);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-/*
- * Walk up the tree from the bottom, freeing leaves and any interior
- * nodes which have had all slots visited. If a node (leaf or
- * interior) is freed, the node above it will have it's slot
- * incremented. The root node will never be freed.
- *
- * At the end of this function, we should have a path which has all
- * slots incremented to the next position for a search. If we need to
- * read a new node it will be NULL and the node above it will have the
- * correct slot selected for a later read.
- *
- * If we increment the root nodes slot counter past the number of
- * elements, 1 is returned to signal completion of the search.
- */
-static int adjust_slots_upwards(struct btrfs_root *root,
- struct btrfs_path *path, int root_level)
-{
- int level = 0;
- int nr, slot;
- struct extent_buffer *eb;
-
- if (root_level == 0)
- return 1;
-
- while (level <= root_level) {
- eb = path->nodes[level];
- nr = btrfs_header_nritems(eb);
- path->slots[level]++;
- slot = path->slots[level];
- if (slot >= nr || level == 0) {
- /*
- * Don't free the root - we will detect this
- * condition after our loop and return a
- * positive value for caller to stop walking the tree.
- */
- if (level != root_level) {
- btrfs_tree_unlock_rw(eb, path->locks[level]);
- path->locks[level] = 0;
-
- free_extent_buffer(eb);
- path->nodes[level] = NULL;
- path->slots[level] = 0;
- }
- } else {
- /*
- * We have a valid slot to walk back down
- * from. Stop here so caller can process these
- * new nodes.
- */
- break;
- }
-
- level++;
- }
-
- eb = path->nodes[root_level];
- if (path->slots[root_level] >= btrfs_header_nritems(eb))
- return 1;
-
- return 0;
-}
-
-/*
- * root_eb is the subtree root and is locked before this function is called.
- */
-static int account_shared_subtree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *root_eb,
- u64 root_gen,
- int root_level)
-{
- int ret = 0;
- int level;
- struct extent_buffer *eb = root_eb;
- struct btrfs_path *path = NULL;
-
- BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
- BUG_ON(root_eb == NULL);
-
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
- return 0;
-
- if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen);
- if (ret)
- goto out;
- }
-
- if (root_level == 0) {
- ret = account_leaf_items(trans, root, root_eb);
- goto out;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /*
- * Walk down the tree. Missing extent blocks are filled in as
- * we go. Metadata is accounted every time we read a new
- * extent block.
- *
- * When we reach a leaf, we account for file extent items in it,
- * walk back up the tree (adjusting slot pointers as we go)
- * and restart the search process.
- */
- extent_buffer_get(root_eb); /* For path */
- path->nodes[root_level] = root_eb;
- path->slots[root_level] = 0;
- path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
-walk_down:
- level = root_level;
- while (level >= 0) {
- if (path->nodes[level] == NULL) {
- int parent_slot;
- u64 child_gen;
- u64 child_bytenr;
-
- /* We need to get child blockptr/gen from
- * parent before we can read it. */
- eb = path->nodes[level + 1];
- parent_slot = path->slots[level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
-
- eb = read_tree_block(root, child_bytenr, child_gen);
- if (IS_ERR(eb)) {
- ret = PTR_ERR(eb);
- goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
- }
-
- path->nodes[level] = eb;
- path->slots[level] = 0;
-
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
- ret = btrfs_qgroup_insert_dirty_extent(trans,
- root->fs_info, child_bytenr,
- root->nodesize, GFP_NOFS);
- if (ret)
- goto out;
- }
-
- if (level == 0) {
- ret = account_leaf_items(trans, root, path->nodes[level]);
- if (ret)
- goto out;
-
- /* Nonzero return here means we completed our search */
- ret = adjust_slots_upwards(root, path, root_level);
- if (ret)
- break;
-
- /* Restart search with new slots */
- goto walk_down;
- }
-
- level--;
- }
-
- ret = 0;
-out:
- btrfs_free_path(path);
-
- return ret;
-}
-
/*
* helper to process tree block while walking down the tree.
*
@@ -8765,6 +8545,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc, int lookup_info)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -8782,7 +8563,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
- ret = btrfs_lookup_extent_info(trans, root,
+ ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
@@ -8810,7 +8591,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+ ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
eb->len, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
@@ -8846,6 +8627,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc, int *lookup_info)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 parent;
@@ -8871,11 +8653,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- blocksize = root->nodesize;
+ blocksize = fs_info->nodesize;
- next = btrfs_find_tree_block(root->fs_info, bytenr);
+ next = find_extent_buffer(fs_info, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(root, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -8886,14 +8668,14 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
+ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
&wc->flags[level - 1]);
if (ret < 0)
goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
- btrfs_err(root->fs_info, "Missing references.");
+ btrfs_err(fs_info, "Missing references.");
ret = -EIO;
goto out_unlock;
}
@@ -8935,7 +8717,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(root, bytenr, generation);
+ next = read_tree_block(fs_info, bytenr, generation);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -8980,16 +8762,17 @@ skip:
}
if (need_account) {
- ret = account_shared_subtree(trans, root, next,
- generation, level - 1);
+ ret = btrfs_qgroup_trace_subtree(trans, root, next,
+ generation, level - 1);
if (ret) {
- btrfs_err_rl(root->fs_info,
+ btrfs_err_rl(fs_info,
"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
ret);
}
}
- ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
+ ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
+ parent, root->root_key.objectid,
+ level - 1, 0);
if (ret)
goto out_unlock;
}
@@ -9021,6 +8804,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
@@ -9050,7 +8834,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
- ret = btrfs_lookup_extent_info(trans, root,
+ ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
@@ -9078,9 +8862,9 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
else
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = account_leaf_items(trans, root, eb);
+ ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
if (ret) {
- btrfs_err_rl(root->fs_info,
+ btrfs_err_rl(fs_info,
"error %d accounting leaf items. Quota is out of sync, rescan required.",
ret);
}
@@ -9092,7 +8876,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
- clean_tree_block(trans, root->fs_info, eb);
+ clean_tree_block(trans, fs_info, eb);
}
if (eb == root->node) {
@@ -9270,7 +9054,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
btrfs_set_lock_blocking(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
- ret = btrfs_lookup_extent_info(trans, root,
+ ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
&wc->flags[level]);
@@ -9296,7 +9080,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
wc->update_ref = update_ref;
wc->keep_locks = 0;
wc->for_reloc = for_reloc;
- wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
@@ -9326,8 +9110,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->level == 0);
- if (btrfs_should_end_transaction(trans, tree_root) ||
- (!for_reloc && btrfs_need_cleaner_sleep(root))) {
+ if (btrfs_should_end_transaction(trans) ||
+ (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
@@ -9337,8 +9121,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- btrfs_end_transaction_throttle(trans, tree_root);
- if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+ btrfs_end_transaction_throttle(trans);
+ if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
err = -EAGAIN;
@@ -9391,7 +9175,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
- btrfs_end_transaction_throttle(trans, tree_root);
+ btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
btrfs_free_path(path);
@@ -9421,6 +9205,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *node,
struct extent_buffer *parent)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct walk_control *wc;
int level;
@@ -9460,7 +9245,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->update_ref = 0;
wc->keep_locks = 1;
wc->for_reloc = 1;
- wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
wret = walk_down_tree(trans, root, path, wc);
@@ -9481,7 +9266,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 num_devices;
u64 stripped;
@@ -9490,11 +9275,11 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
* if restripe for this chunk_type is on pick target profile and
* return, otherwise do the usual balance
*/
- stripped = get_restripe_target(root->fs_info, flags);
+ stripped = get_restripe_target(fs_info, flags);
if (stripped)
return extended_to_chunk(stripped);
- num_devices = root->fs_info->fs_devices->rw_devices;
+ num_devices = fs_info->fs_devices->rw_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -9579,6 +9364,7 @@ int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
@@ -9593,14 +9379,14 @@ again:
* block groups cache has started writing. If it already started,
* back off and let this transaction commit
*/
- mutex_lock(&root->fs_info->ro_block_group_mutex);
+ mutex_lock(&fs_info->ro_block_group_mutex);
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
- mutex_unlock(&root->fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans, root);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
- ret = btrfs_wait_for_commit(root, transid);
+ ret = btrfs_wait_for_commit(fs_info, transid);
if (ret)
return ret;
goto again;
@@ -9610,9 +9396,9 @@ again:
* if we are changing raid levels, try to allocate a corresponding
* block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(root, cache->flags);
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, alloc_flags,
+ ret = do_chunk_alloc(trans, fs_info, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
@@ -9628,31 +9414,31 @@ again:
ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
- alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, alloc_flags,
+ alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
+ ret = do_chunk_alloc(trans, fs_info, alloc_flags,
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(root, cache->flags);
- lock_chunks(root->fs_info->chunk_root);
- check_system_chunk(trans, root, alloc_flags);
- unlock_chunks(root->fs_info->chunk_root);
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ mutex_lock(&fs_info->chunk_mutex);
+ check_system_chunk(trans, fs_info, alloc_flags);
+ mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&root->fs_info->ro_block_group_mutex);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+ struct btrfs_fs_info *fs_info, u64 type)
{
- u64 alloc_flags = get_alloc_profile(root, type);
- return do_chunk_alloc(trans, root, alloc_flags,
- CHUNK_ALLOC_FORCE);
+ u64 alloc_flags = get_alloc_profile(fs_info, type);
+
+ return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
}
/*
@@ -9696,8 +9482,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_dec_block_group_ro(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache)
+void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -9723,11 +9508,12 @@ void btrfs_dec_block_group_ro(struct btrfs_root *root,
* @return - -1 if it's not a good idea to relocate this block group, 0 if its
* ok to go ahead and try.
*/
-int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
+int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
{
+ struct btrfs_root *root = fs_info->extent_root;
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct btrfs_trans_handle *trans;
u64 min_free;
@@ -9739,14 +9525,14 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
int full = 0;
int ret = 0;
- debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
+ debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
- block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
/* odd, couldn't find the block group, leave it alone */
if (!block_group) {
if (debug)
- btrfs_warn(root->fs_info,
+ btrfs_warn(fs_info,
"can't find block group for bytenr %llu",
bytenr);
return -1;
@@ -9796,7 +9582,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* 3: raid0
* 4: single
*/
- target = get_restripe_target(root->fs_info, block_group->flags);
+ target = get_restripe_target(fs_info, block_group->flags);
if (target) {
index = __get_raid_index(extended_to_chunk(target));
} else {
@@ -9806,9 +9592,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (full) {
if (debug)
- btrfs_warn(root->fs_info,
- "no space to alloc new chunk for block group %llu",
- block_group->key.objectid);
+ btrfs_warn(fs_info,
+ "no space to alloc new chunk for block group %llu",
+ block_group->key.objectid);
goto out;
}
@@ -9836,7 +9622,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
goto out;
}
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 dev_offset;
@@ -9858,19 +9644,21 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
}
}
if (debug && ret == -1)
- btrfs_warn(root->fs_info,
- "no space to allocate a new chunk for block group %llu",
- block_group->key.objectid);
- mutex_unlock(&root->fs_info->chunk_mutex);
- btrfs_end_transaction(trans, root);
+ btrfs_warn(fs_info,
+ "no space to allocate a new chunk for block group %llu",
+ block_group->key.objectid);
+ mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_end_transaction(trans);
out:
btrfs_put_block_group(block_group);
return ret;
}
-static int find_first_block_group(struct btrfs_root *root,
- struct btrfs_path *path, struct btrfs_key *key)
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
+ struct btrfs_root *root = fs_info->extent_root;
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -9904,7 +9692,7 @@ static int find_first_block_group(struct btrfs_root *root,
found_key.offset);
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
found_key.objectid, found_key.offset);
ret = -ENOENT;
@@ -9934,8 +9722,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
if (block_group->iref)
break;
spin_unlock(&block_group->lock);
- block_group = next_block_group(info->tree_root,
- block_group);
+ block_group = next_block_group(info, block_group);
}
if (!block_group) {
if (last == 0)
@@ -10003,7 +9790,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
*/
if (block_group->cached == BTRFS_CACHE_NO ||
block_group->cached == BTRFS_CACHE_ERROR)
- free_excluded_extents(info->extent_root, block_group);
+ free_excluded_extents(info, block_group);
btrfs_remove_free_space_cache(block_group);
ASSERT(list_empty(&block_group->dirty_list));
@@ -10094,7 +9881,8 @@ out_err:
}
static struct btrfs_block_group_cache *
-btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
{
struct btrfs_block_group_cache *cache;
@@ -10113,11 +9901,11 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->key.offset = size;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = root->sectorsize;
- cache->fs_info = root->fs_info;
- cache->full_stripe_len = btrfs_full_stripe_len(root,
- &root->fs_info->mapping_tree,
- start);
+ cache->sectorsize = fs_info->sectorsize;
+ cache->fs_info = fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(fs_info,
+ &fs_info->mapping_tree,
+ start);
set_free_space_tree_thresholds(cache);
atomic_set(&cache->count, 1);
@@ -10136,12 +9924,11 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
return cache;
}
-int btrfs_read_block_groups(struct btrfs_root *root)
+int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
int ret;
struct btrfs_block_group_cache *cache;
- struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *space_info;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -10154,7 +9941,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
feature = btrfs_super_incompat_flags(info->super_copy);
mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
- root = info->extent_root;
key.objectid = 0;
key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -10163,15 +9949,15 @@ int btrfs_read_block_groups(struct btrfs_root *root)
return -ENOMEM;
path->reada = READA_FORWARD;
- cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
- btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
+ cache_gen = btrfs_super_cache_generation(info->super_copy);
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ btrfs_super_generation(info->super_copy) != cache_gen)
need_clear = 1;
- if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
+ if (btrfs_test_opt(info, CLEAR_CACHE))
need_clear = 1;
while (1) {
- ret = find_first_block_group(root, path, &key);
+ ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
@@ -10180,7 +9966,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- cache = btrfs_create_block_group_cache(root, found_key.objectid,
+ cache = btrfs_create_block_group_cache(info, found_key.objectid,
found_key.offset);
if (!cache) {
ret = -ENOMEM;
@@ -10198,7 +9984,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
+ if (btrfs_test_opt(info, SPACE_CACHE))
cache->disk_cache_state = BTRFS_DC_CLEAR;
}
@@ -10224,13 +10010,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* info has super bytes accounted for, otherwise we'll think
* we have more space than we actually do.
*/
- ret = exclude_super_stripes(root, cache);
+ ret = exclude_super_stripes(info, cache);
if (ret) {
/*
* We may have excluded something, so call this just in
* case.
*/
- free_excluded_extents(root, cache);
+ free_excluded_extents(info, cache);
btrfs_put_block_group(cache);
goto error;
}
@@ -10245,25 +10031,25 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- free_excluded_extents(root, cache);
+ free_excluded_extents(info, cache);
} else if (btrfs_block_group_used(&cache->item) == 0) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, root->fs_info,
+ add_new_free_space(cache, info,
found_key.objectid,
found_key.objectid +
found_key.offset);
- free_excluded_extents(root, cache);
+ free_excluded_extents(info, cache);
}
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ ret = btrfs_add_block_group_cache(info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
goto error;
}
- trace_btrfs_add_block_group(root->fs_info, cache, 0);
+ trace_btrfs_add_block_group(info, cache, 0);
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
cache->bytes_super, &space_info);
@@ -10282,8 +10068,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache);
- set_avail_alloc_bits(root->fs_info, cache->flags);
- if (btrfs_chunk_readonly(root, cache->key.objectid)) {
+ set_avail_alloc_bits(info, cache->flags);
+ if (btrfs_chunk_readonly(info, cache->key.objectid)) {
inc_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
@@ -10297,8 +10083,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
}
- list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
- if (!(get_alloc_profile(root, space_info->flags) &
+ list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ if (!(get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID5 |
@@ -10327,10 +10113,10 @@ error:
}
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *block_group, *tmp;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
@@ -10350,11 +10136,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
sizeof(item));
if (ret)
btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, extent_root,
- key.objectid, key.offset);
+ ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
+ key.offset);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, root->fs_info, block_group);
+ add_block_group_free_space(trans, fs_info, block_group);
/* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
@@ -10363,18 +10149,16 @@ next:
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytes_used,
+ struct btrfs_fs_info *fs_info, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size)
{
- int ret;
- struct btrfs_root *extent_root;
struct btrfs_block_group_cache *cache;
- extent_root = root->fs_info->extent_root;
+ int ret;
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
- cache = btrfs_create_block_group_cache(root, chunk_offset, size);
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
if (!cache)
return -ENOMEM;
@@ -10386,28 +10170,27 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->needs_free_space = 1;
- ret = exclude_super_stripes(root, cache);
+ ret = exclude_super_stripes(fs_info, cache);
if (ret) {
/*
* We may have excluded something, so call this just in
* case.
*/
- free_excluded_extents(root, cache);
+ free_excluded_extents(fs_info, cache);
btrfs_put_block_group(cache);
return ret;
}
- add_new_free_space(cache, root->fs_info, chunk_offset,
- chunk_offset + size);
+ add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
- free_excluded_extents(root, cache);
+ free_excluded_extents(fs_info, cache);
#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(root, cache)) {
+ if (btrfs_should_fragment_free_space(cache)) {
u64 new_bytes_used = size - bytes_used;
bytes_used += new_bytes_used >> 1;
- fragment_free_space(root, cache);
+ fragment_free_space(cache);
}
#endif
/*
@@ -10415,7 +10198,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* assigned to our block group, but don't update its counters just yet.
* We want our bg to be added to the rbtree with its ->space_info set.
*/
- ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
+ ret = update_space_info(fs_info, cache->flags, 0, 0, 0,
&cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -10423,7 +10206,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_add_block_group_cache(root->fs_info, cache);
+ ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
@@ -10434,26 +10217,26 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* Now that our block group has its ->space_info set and is inserted in
* the rbtree, update the space info's counters.
*/
- trace_btrfs_add_block_group(root->fs_info, cache, 1);
- ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+ trace_btrfs_add_block_group(fs_info, cache, 1);
+ ret = update_space_info(fs_info, cache->flags, size, bytes_used,
cache->bytes_super, &cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
- spin_lock(&root->fs_info->block_group_cache_lock);
+ spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&cache->cache_node,
- &root->fs_info->block_group_cache_tree);
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&cache->cache_node);
- spin_unlock(&root->fs_info->block_group_cache_lock);
+ spin_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
return ret;
}
- update_global_block_rsv(root->fs_info);
+ update_global_block_rsv(fs_info);
__link_block_group(cache->space_info, cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
- set_avail_alloc_bits(extent_root->fs_info, type);
+ set_avail_alloc_bits(fs_info, type);
return 0;
}
@@ -10473,13 +10256,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 group_start,
+ struct btrfs_fs_info *fs_info, u64 group_start,
struct extent_map *em)
{
+ struct btrfs_root *root = fs_info->extent_root;
struct btrfs_path *path;
struct btrfs_block_group_cache *block_group;
struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = root->fs_info->tree_root;
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_key key;
struct inode *inode;
struct kobject *kobj = NULL;
@@ -10489,9 +10273,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_em;
- root = root->fs_info->extent_root;
-
- block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+ block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
@@ -10499,7 +10281,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* Free the reserved super bytes from this block group before
* remove it.
*/
- free_excluded_extents(root, block_group);
+ free_excluded_extents(fs_info, block_group);
memcpy(&key, &block_group->key, sizeof(key));
index = get_block_group_index(block_group);
@@ -10511,7 +10293,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
factor = 1;
/* make sure this block group isn't part of an allocation cluster */
- cluster = &root->fs_info->data_alloc_cluster;
+ cluster = &fs_info->data_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
@@ -10520,7 +10302,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* make sure this block group isn't part of a metadata
* allocation cluster
*/
- cluster = &root->fs_info->meta_alloc_cluster;
+ cluster = &fs_info->meta_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
@@ -10549,9 +10331,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_wait_cache_io(root, trans, block_group,
- &block_group->io_ctl, path,
- block_group->key.objectid);
+ btrfs_wait_cache_io(trans, block_group, path);
btrfs_put_block_group(block_group);
spin_lock(&trans->transaction->dirty_bgs_lock);
}
@@ -10600,14 +10380,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
- spin_lock(&root->fs_info->block_group_cache_lock);
+ spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
- &root->fs_info->block_group_cache_tree);
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- if (root->fs_info->first_logical_byte == block_group->key.objectid)
- root->fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&root->fs_info->block_group_cache_lock);
+ if (fs_info->first_logical_byte == block_group->key.objectid)
+ fs_info->first_logical_byte = (u64)-1;
+ spin_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -10618,7 +10398,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
- clear_avail_alloc_bits(root->fs_info, block_group->flags);
+ clear_avail_alloc_bits(fs_info, block_group->flags);
}
up_write(&block_group->space_info->groups_sem);
if (kobj) {
@@ -10631,12 +10411,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- down_write(&root->fs_info->commit_root_sem);
+ down_write(&fs_info->commit_root_sem);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
list_for_each_entry(ctl,
- &root->fs_info->caching_block_groups, list)
+ &fs_info->caching_block_groups, list)
if (ctl->block_group == block_group) {
caching_ctl = ctl;
atomic_inc(&caching_ctl->count);
@@ -10645,7 +10425,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- up_write(&root->fs_info->commit_root_sem);
+ up_write(&fs_info->commit_root_sem);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
put_caching_control(caching_ctl);
@@ -10666,7 +10446,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
- if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
< block_group->key.offset);
WARN_ON(block_group->space_info->bytes_readonly
@@ -10682,7 +10462,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
memcpy(&key, &block_group->key, sizeof(key));
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&em->list)) {
/* We're in the transaction->pending_chunks list. */
free_extent_map(em);
@@ -10730,14 +10510,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* sees the em, either in the pending_chunks list or in the
* pinned_chunks list.
*/
- list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+ list_move_tail(&em->list, &fs_info->pinned_chunks);
}
spin_unlock(&block_group->lock);
if (remove_em) {
struct extent_map_tree *em_tree;
- em_tree = &root->fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
/*
* The em might be in the pending_chunks list, so make sure the
@@ -10750,9 +10530,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_extent_map(em);
}
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
- ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ ret = remove_block_group_free_space(trans, fs_info, block_group);
if (ret)
goto out;
@@ -10820,7 +10600,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
int ret = 0;
@@ -10881,7 +10660,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
trans = btrfs_start_trans_remove_block_group(fs_info,
block_group->key.objectid);
if (IS_ERR(trans)) {
- btrfs_dec_block_group_ro(root, block_group);
+ btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -10908,14 +10687,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(root, block_group);
+ btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(root, block_group);
+ btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10934,7 +10713,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&space_info->lock);
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(root->fs_info, DISCARD);
+ trimming = btrfs_test_opt(fs_info, DISCARD);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -10944,7 +10723,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
- ret = btrfs_remove_chunk(trans, root,
+ ret = btrfs_remove_chunk(trans, fs_info,
block_group->key.objectid);
if (ret) {
@@ -10971,7 +10750,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_get_block_group(block_group);
}
end_trans:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
next:
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
@@ -11018,9 +10797,10 @@ out:
return ret;
}
-int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
+ u64 start, u64 end)
{
- return unpin_extent_range(root, start, end, false);
+ return unpin_extent_range(fs_info, start, end, false);
}
/*
@@ -11060,7 +10840,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = 0;
while (1) {
- struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_transaction *trans;
u64 bytes;
@@ -11110,9 +10890,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
-int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_device *device;
struct list_head *devices;
@@ -11167,11 +10946,11 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
}
}
- cache = next_block_group(fs_info->tree_root, cache);
+ cache = next_block_group(fs_info, cache);
}
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- devices = &root->fs_info->fs_devices->alloc_list;
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ devices = &fs_info->fs_devices->alloc_list;
list_for_each_entry(device, devices, dev_alloc_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
@@ -11180,7 +10959,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
trimmed += group_trimmed;
}
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
range->len = trimmed;
return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8ed05d95584a..4ac383a3a649 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -127,7 +127,7 @@ struct extent_page_data {
*/
unsigned int extent_locked:1;
- /* tells the submit_bio code to use a WRITE_SYNC */
+ /* tells the submit_bio code to use REQ_SYNC */
unsigned int sync_io:1;
};
@@ -2029,7 +2029,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, WRITE, logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
@@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(bio)) {
@@ -2067,20 +2067,20 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return 0;
}
-int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
- int mirror_num)
+int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int mirror_num)
{
u64 start = eb->start;
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
int ret = 0;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
+ if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
- ret = repair_io_failure(root->fs_info->btree_inode, start,
+ ret = repair_io_failure(fs_info->btree_inode, start,
PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
@@ -2341,6 +2341,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct page *page, int pg_offset, int icsum,
bio_end_io_t *endio_func, void *data)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio *bio;
struct btrfs_io_bio *btrfs_failed_bio;
struct btrfs_io_bio *btrfs_bio;
@@ -2351,13 +2352,12 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+ bio->bi_bdev = fs_info->fs_devices->latest_bdev;
bio->bi_iter.bi_size = 0;
bio->bi_private = data;
btrfs_failed_bio = btrfs_io_bio(failed_bio);
if (btrfs_failed_bio->csum) {
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
btrfs_bio = btrfs_io_bio(bio);
@@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct bio *bio;
- int read_mode;
+ int read_mode = 0;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
}
if (failed_bio->bi_vcnt > 1)
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
- else
- read_mode = READ_SYNC;
+ read_mode |= REQ_FAILFAST_DEV;
phy_offset >>= inode->i_sb->s_blocksize_bits;
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
@@ -2476,6 +2474,8 @@ static void end_bio_extent_writepage(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
@@ -2484,11 +2484,11 @@ static void end_bio_extent_writepage(struct bio *bio)
* if they don't add up to a full page. */
if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ btrfs_err(fs_info,
"partial page write in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
else
- btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ btrfs_info(fs_info,
"incomplete page write in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
}
@@ -3484,7 +3484,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unsigned long nr_written = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC;
+ write_flags = REQ_SYNC;
trace___extent_writepage(page, inode, wbc);
@@ -3729,7 +3729,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
unsigned long i, num_pages;
unsigned long bio_flags = 0;
unsigned long start, end;
- int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META;
+ int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
int ret = 0;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
@@ -3743,16 +3743,15 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
if (btrfs_header_level(eb) > 0) {
end = btrfs_node_key_ptr_offset(nritems);
- memset_extent_buffer(eb, 0, end, eb->len - end);
+ memzero_extent_buffer(eb, end, eb->len - end);
} else {
/*
* leaf:
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
- end = btrfs_leaf_data(eb) +
- leaf_data_end(fs_info->tree_root, eb);
- memset_extent_buffer(eb, 0, start, end - start);
+ end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb);
+ memzero_extent_buffer(eb, start, end - start);
}
for (i = 0; i < num_pages; i++) {
@@ -4076,7 +4075,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
int ret;
bio_set_op_attrs(epd->bio, REQ_OP_WRITE,
- epd->sync_io ? WRITE_SYNC : 0);
+ epd->sync_io ? REQ_SYNC : 0);
ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
@@ -4343,7 +4342,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
u64 last,
get_extent_t *get_extent)
{
- u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ u64 sectorsize = btrfs_inode_sectorsize(inode);
struct extent_map *em;
u64 len;
@@ -4404,8 +4403,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
- start = round_down(start, BTRFS_I(inode)->root->sectorsize);
- len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
+ start = round_down(start, btrfs_inode_sectorsize(inode));
+ len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
/*
* lookup the last file extent. We're not using i_size here
@@ -4539,7 +4538,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
root->objectid,
btrfs_ino(inode), bytenr);
if (trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret < 0)
goto out_free;
if (ret)
@@ -4720,9 +4719,9 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
WARN_ON(PageDirty(p));
SetPageUptodate(p);
new->pages[i] = p;
+ copy_page(page_address(p), page_address(src->pages[i]));
}
- copy_extent_buffer(new, src, 0, 0, src->len);
set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
@@ -4760,21 +4759,9 @@ err:
}
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, u32 nodesize)
+ u64 start)
{
- unsigned long len;
-
- if (!fs_info) {
- /*
- * Called only from tests that don't always have a fs_info
- * available
- */
- len = nodesize;
- } else {
- len = fs_info->tree_root->nodesize;
- }
-
- return __alloc_dummy_extent_buffer(fs_info, start, len);
+ return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}
static void check_buffer_tree_ref(struct extent_buffer *eb)
@@ -4865,7 +4852,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, u32 nodesize)
+ u64 start)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4873,7 +4860,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
return NULL;
eb->fs_info = fs_info;
@@ -4913,7 +4900,7 @@ free_eb:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
- unsigned long len = fs_info->tree_root->nodesize;
+ unsigned long len = fs_info->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
unsigned long index = start >> PAGE_SHIFT;
@@ -4924,7 +4911,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int uptodate = 1;
int ret;
- if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
btrfs_err(fs_info, "bad tree block start %llu", start);
return ERR_PTR(-EINVAL);
}
@@ -5465,6 +5452,27 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
return ret;
}
+void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ const void *srcv)
+{
+ char *kaddr;
+
+ WARN_ON(!PageUptodate(eb->pages[0]));
+ kaddr = page_address(eb->pages[0]);
+ memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
+ BTRFS_FSID_SIZE);
+}
+
+void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+{
+ char *kaddr;
+
+ WARN_ON(!PageUptodate(eb->pages[0]));
+ kaddr = page_address(eb->pages[0]);
+ memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
+ BTRFS_FSID_SIZE);
+}
+
void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
unsigned long start, unsigned long len)
{
@@ -5496,8 +5504,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
}
}
-void memset_extent_buffer(struct extent_buffer *eb, char c,
- unsigned long start, unsigned long len)
+void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
{
size_t cur;
size_t offset;
@@ -5517,7 +5525,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
- memset(kaddr + offset, c, cur);
+ memset(kaddr + offset, 0, cur);
len -= cur;
offset = 0;
@@ -5525,6 +5533,20 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
}
}
+void copy_extent_buffer_full(struct extent_buffer *dst,
+ struct extent_buffer *src)
+{
+ int i;
+ unsigned num_pages;
+
+ ASSERT(dst->len == src->len);
+
+ num_pages = num_extent_pages(dst->start, dst->len);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+}
+
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
@@ -5766,6 +5788,7 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len)
{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -5774,13 +5797,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- btrfs_err(dst->fs_info,
+ btrfs_err(fs_info,
"memmove bogus src_offset %lu move len %lu dst len %lu",
src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- btrfs_err(dst->fs_info,
+ btrfs_err(fs_info,
"memmove bogus dst_offset %lu move len %lu dst len %lu",
dst_offset, len, dst->len);
BUG_ON(1);
@@ -5812,6 +5835,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len)
{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -5822,13 +5846,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- btrfs_err(dst->fs_info,
+ btrfs_err(fs_info,
"memmove bogus src_offset %lu move len %lu len %lu",
src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- btrfs_err(dst->fs_info,
+ btrfs_err(fs_info,
"memmove bogus dst_offset %lu move len %lu len %lu",
dst_offset, len, dst->len);
BUG_ON(1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ab31d145227e..17f9ce479ed7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -371,7 +371,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, u32 nodesize);
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -405,8 +405,13 @@ void read_extent_buffer(struct extent_buffer *eb, void *dst,
int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
unsigned long start,
unsigned long len);
+void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
+void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ const void *src);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
+void copy_extent_buffer_full(struct extent_buffer *dst,
+ struct extent_buffer *src);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
@@ -414,8 +419,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
-void memset_extent_buffer(struct extent_buffer *eb, char c,
- unsigned long start, unsigned long len);
+void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ unsigned long len);
int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
unsigned long pos);
void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
@@ -452,8 +457,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
- int mirror_num);
+int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@@ -491,5 +496,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, u32 nodesize);
+ u64 start);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d0d571c47d33..e97e322c28f0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,9 +34,9 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
-#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
- sizeof(u32) * (r)->sectorsize)
+ sizeof(u32) * (fs_info)->sectorsize)
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -90,13 +90,14 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -116,7 +117,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
csum_offset = (bytenr - found_key.offset) >>
- root->fs_info->sb->s_blocksize_bits;
+ fs_info->sb->s_blocksize_bits;
csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
csums_in_item /= csum_size;
@@ -159,11 +160,11 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
kfree(bio->csum_allocated);
}
-static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
- struct inode *inode, struct bio *bio,
+static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct bio_vec *bvec;
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -176,9 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 page_bytes_left;
u32 diff;
int nblocks;
- int bio_index = 0;
- int count;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ int count = 0, i;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
@@ -223,8 +223,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (dio)
offset = logical_offset;
- page_bytes_left = bvec->bv_len;
- while (bio_index < bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, bio, i) {
+ page_bytes_left = bvec->bv_len;
+ if (count)
+ goto next;
+
if (!dio)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
@@ -239,7 +242,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (item)
btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+ item = btrfs_lookup_csum(NULL, fs_info->csum_root,
path, disk_bytenr, 0);
if (IS_ERR(item)) {
count = 1;
@@ -247,10 +250,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
- offset + root->sectorsize - 1,
+ offset + fs_info->sectorsize - 1,
EXTENT_NODATASUM);
} else {
- btrfs_info_rl(BTRFS_I(inode)->root->fs_info,
+ btrfs_info_rl(fs_info,
"no csum found for inode %llu start %llu",
btrfs_ino(inode), offset);
}
@@ -266,7 +269,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
path->slots[0]);
item_last_offset = item_start_offset +
(item_size / csum_size) *
- root->sectorsize;
+ fs_info->sectorsize;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
}
@@ -275,7 +278,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
* a single leaf so it will also fit inside a u32
*/
diff = disk_bytenr - item_start_offset;
- diff = diff / root->sectorsize;
+ diff = diff / fs_info->sectorsize;
diff = diff * csum_size;
count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
inode->i_sb->s_blocksize_bits);
@@ -285,48 +288,35 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
found:
csum += count * csum_size;
nblocks -= count;
-
+next:
while (count--) {
- disk_bytenr += root->sectorsize;
- offset += root->sectorsize;
- page_bytes_left -= root->sectorsize;
- if (!page_bytes_left) {
- bio_index++;
- /*
- * make sure we're still inside the
- * bio before we update page_bytes_left
- */
- if (bio_index >= bio->bi_vcnt) {
- WARN_ON_ONCE(count);
- goto done;
- }
- bvec++;
- page_bytes_left = bvec->bv_len;
- }
-
+ disk_bytenr += fs_info->sectorsize;
+ offset += fs_info->sectorsize;
+ page_bytes_left -= fs_info->sectorsize;
+ if (!page_bytes_left)
+ break; /* move to next bio */
}
}
-done:
+ WARN_ON_ONCE(count);
btrfs_free_path(path);
return 0;
}
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u32 *dst)
+int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
{
- return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+ return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
-int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 offset)
+int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
{
- return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
+ return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -337,10 +327,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- ASSERT(IS_ALIGNED(start, root->sectorsize) &&
- IS_ALIGNED(end + 1, root->sectorsize));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
path = btrfs_alloc_path();
if (!path)
@@ -365,7 +355,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
offset = (start - key.offset) >>
- root->fs_info->sb->s_blocksize_bits;
+ fs_info->sb->s_blocksize_bits;
if (offset * csum_size <
btrfs_item_size_nr(leaf, path->slots[0] - 1))
path->slots[0]--;
@@ -393,7 +383,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
start = key.offset;
size = btrfs_item_size_nr(leaf, path->slots[0]);
- csum_end = key.offset + (size / csum_size) * root->sectorsize;
+ csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
if (csum_end <= start) {
path->slots[0]++;
continue;
@@ -404,8 +394,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
- MAX_ORDERED_SUM_BYTES(root));
- sums = kzalloc(btrfs_ordered_sum_size(root, size),
+ MAX_ORDERED_SUM_BYTES(fs_info));
+ sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
@@ -416,16 +406,16 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->len = (int)size;
offset = (start - key.offset) >>
- root->fs_info->sb->s_blocksize_bits;
+ fs_info->sb->s_blocksize_bits;
offset *= csum_size;
- size >>= root->fs_info->sb->s_blocksize_bits;
+ size >>= fs_info->sb->s_blocksize_bits;
read_extent_buffer(path->nodes[0],
sums->sums,
((unsigned long)item) + offset,
csum_size * size);
- start += root->sectorsize * size;
+ start += fs_info->sectorsize * size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@@ -443,23 +433,23 @@ fail:
return ret;
}
-int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u64 file_start, int contig)
+int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+ u64 file_start, int contig)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *sums;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *ordered = NULL;
char *data;
- struct bio_vec *bvec = bio->bi_io_vec;
- int bio_index = 0;
+ struct bio_vec *bvec;
int index;
int nr_sectors;
- int i;
+ int i, j;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
WARN_ON(bio->bi_vcnt <= 0);
- sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+ sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
GFP_NOFS);
if (!sums)
return -ENOMEM;
@@ -470,22 +460,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (contig)
offset = file_start;
else
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ offset = 0; /* shut up gcc */
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered); /* Logic error */
sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
- while (bio_index < bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, bio, j) {
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ if (!ordered) {
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
+ BUG_ON(!ordered); /* Logic error */
+ }
+
data = kmap_atomic(bvec->bv_page);
- nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- bvec->bv_len + root->sectorsize
- - 1);
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
+ bvec->bv_len + fs_info->sectorsize
+ - 1);
for (i = 0; i < nr_sectors; i++) {
if (offset >= ordered->file_offset + ordered->len ||
@@ -500,8 +493,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
bytes_left = bio->bi_iter.bi_size - total_bytes;
- sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
- GFP_NOFS);
+ sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
+ GFP_NOFS);
BUG_ON(!sums); /* -ENOMEM */
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
@@ -517,21 +510,18 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums->sums[index] = ~(u32)0;
sums->sums[index]
= btrfs_csum_data(data + bvec->bv_offset
- + (i * root->sectorsize),
+ + (i * fs_info->sectorsize),
sums->sums[index],
- root->sectorsize);
+ fs_info->sectorsize);
btrfs_csum_final(sums->sums[index],
(char *)(sums->sums + index));
index++;
- offset += root->sectorsize;
- this_sum_bytes += root->sectorsize;
- total_bytes += root->sectorsize;
+ offset += fs_info->sectorsize;
+ this_sum_bytes += fs_info->sectorsize;
+ total_bytes += fs_info->sectorsize;
}
kunmap_atomic(data);
-
- bio_index++;
- bvec++;
}
this_sum_bytes = 0;
btrfs_add_ordered_sum(inode, ordered, sums);
@@ -550,20 +540,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
-static noinline void truncate_one_csum(struct btrfs_root *root,
+static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
u64 csum_end;
u64 end_byte = bytenr + len;
- u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+ u32 blocksize_bits = fs_info->sb->s_blocksize_bits;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
- csum_end <<= root->fs_info->sb->s_blocksize_bits;
+ csum_end <<= fs_info->sb->s_blocksize_bits;
csum_end += key->offset;
if (key->offset < bytenr && csum_end <= end_byte) {
@@ -575,7 +565,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(root, path, new_size, 1);
+ btrfs_truncate_item(fs_info, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -587,10 +577,10 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(root, path, new_size, 0);
+ btrfs_truncate_item(fs_info, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(root->fs_info, path, key);
+ btrfs_set_item_key_safe(fs_info, path, key);
} else {
BUG();
}
@@ -601,18 +591,17 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
* range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr, u64 len)
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 len)
{
+ struct btrfs_root *root = fs_info->csum_root;
struct btrfs_path *path;
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
-
- root = root->fs_info->csum_root;
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ int blocksize_bits = fs_info->sb->s_blocksize_bits;
path = btrfs_alloc_path();
if (!path)
@@ -689,7 +678,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
item_offset = btrfs_item_ptr_offset(leaf,
path->slots[0]);
- memset_extent_buffer(leaf, 0, item_offset + offset,
+ memzero_extent_buffer(leaf, item_offset + offset,
shift_len);
key.offset = bytenr;
@@ -705,7 +694,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
} else {
- truncate_one_csum(root, path, &key, bytenr, len);
+ truncate_one_csum(fs_info, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -721,6 +710,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key file_key;
struct btrfs_key found_key;
struct btrfs_path *path;
@@ -736,7 +726,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
int index = 0;
int found_next;
int ret;
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
path = btrfs_alloc_path();
if (!path)
@@ -769,7 +759,7 @@ again:
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if ((item_size / csum_size) >=
- MAX_CSUM_ITEMS(root, csum_size)) {
+ MAX_CSUM_ITEMS(fs_info, csum_size)) {
/* already at max size, make a new one */
goto insert;
}
@@ -815,11 +805,11 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
csum_offset = (bytenr - found_key.offset) >>
- root->fs_info->sb->s_blocksize_bits;
+ fs_info->sb->s_blocksize_bits;
if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+ csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) {
goto insert;
}
@@ -830,26 +820,27 @@ again:
u32 diff;
u32 free_space;
- if (btrfs_leaf_free_space(root, leaf) <
+ if (btrfs_leaf_free_space(fs_info, leaf) <
sizeof(struct btrfs_item) + csum_size * 2)
goto insert;
- free_space = btrfs_leaf_free_space(root, leaf) -
+ free_space = btrfs_leaf_free_space(fs_info, leaf) -
sizeof(struct btrfs_item) - csum_size;
tmp = sums->len - total_bytes;
- tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sb->s_blocksize_bits;
WARN_ON(tmp < 1);
extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
- diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
+ diff = min(diff,
+ MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
diff = min(free_space, diff);
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(root, path, diff);
+ btrfs_extend_item(fs_info, path, diff);
ret = 0;
goto csum;
}
@@ -861,12 +852,12 @@ insert:
u64 tmp;
tmp = sums->len - total_bytes;
- tmp >>= root->fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sb->s_blocksize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
- root->fs_info->sb->s_blocksize_bits);
+ fs_info->sb->s_blocksize_bits);
tmp = max((u64)1, tmp);
- tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+ tmp = min(tmp, (u64)MAX_CSUM_ITEMS(fs_info, csum_size));
ins_size = csum_size * tmp;
} else {
ins_size = csum_size;
@@ -888,7 +879,7 @@ csum:
csum_offset * csum_size);
found:
ins_size = (u32)(sums->len - total_bytes) >>
- root->fs_info->sb->s_blocksize_bits;
+ fs_info->sb->s_blocksize_bits;
ins_size *= csum_size;
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
ins_size);
@@ -896,7 +887,7 @@ found:
ins_size);
ins_size /= csum_size;
- total_bytes += ins_size * root->sectorsize;
+ total_bytes += ins_size * fs_info->sectorsize;
index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -919,6 +910,7 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
const bool new_inline,
struct extent_map *em)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
@@ -928,7 +920,7 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
@@ -939,7 +931,8 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, slot, fi);
- extent_end = ALIGN(extent_start + size, root->sectorsize);
+ extent_end = ALIGN(extent_start + size,
+ fs_info->sectorsize);
}
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
@@ -982,7 +975,7 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
em->compress_type = compress_type;
}
} else {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, root %llu",
type, btrfs_ino(inode), extent_start,
root->root_key.objectid);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3a14c87d9c92..b5c5da215d05 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -27,7 +27,6 @@
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
@@ -96,13 +95,13 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
int ret;
- p = &root->fs_info->defrag_inodes.rb_node;
+ p = &fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -126,16 +125,16 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
}
set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+ rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
-static inline int __need_auto_defrag(struct btrfs_root *root)
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
- if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
+ if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
- if (btrfs_fs_closing(root->fs_info))
+ if (btrfs_fs_closing(fs_info))
return 0;
return 1;
@@ -148,12 +147,13 @@ static inline int __need_auto_defrag(struct btrfs_root *root)
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
u64 transid;
int ret;
- if (!__need_auto_defrag(root))
+ if (!__need_auto_defrag(fs_info))
return 0;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -172,7 +172,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->transid = transid;
defrag->root = root->root_key.objectid;
- spin_lock(&root->fs_info->defrag_inodes_lock);
+ spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
/*
* If we set IN_DEFRAG flag and evict the inode from memory,
@@ -185,7 +185,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
- spin_unlock(&root->fs_info->defrag_inodes_lock);
+ spin_unlock(&fs_info->defrag_inodes_lock);
return 0;
}
@@ -197,19 +197,19 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
static void btrfs_requeue_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
- if (!__need_auto_defrag(root))
+ if (!__need_auto_defrag(fs_info))
goto out;
/*
* Here we don't check the IN_DEFRAG flag, because we need merge
* them together.
*/
- spin_lock(&root->fs_info->defrag_inodes_lock);
+ spin_lock(&fs_info->defrag_inodes_lock);
ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&root->fs_info->defrag_inodes_lock);
+ spin_unlock(&fs_info->defrag_inodes_lock);
if (ret)
goto out;
return;
@@ -373,7 +373,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
&fs_info->fs_state))
break;
- if (!__need_auto_defrag(fs_info->tree_root))
+ if (!__need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
@@ -485,11 +485,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
- struct page **pages, size_t num_pages,
- loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int err = 0;
int i;
u64 num_bytes;
@@ -498,8 +498,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(inode);
- start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
+ start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+ num_bytes = round_up(write_bytes + pos - start_pos,
+ fs_info->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -696,6 +697,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
u32 extent_item_size,
int *key_inserted)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
@@ -706,6 +708,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
+ u64 last_end = start;
int del_nr = 0;
int del_slot = 0;
int extent_type;
@@ -723,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == root->fs_info->tree_root);
+ root == fs_info->tree_root);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -797,8 +800,10 @@ next_slot:
* extent item in the call to setup_items_for_insert() later
* in this function.
*/
- if (extent_end == key.offset && extent_end >= search_start)
+ if (extent_end == key.offset && extent_end >= search_start) {
+ last_end = extent_end;
goto delete_extent_item;
+ }
if (extent_end <= search_start) {
path->slots[0]++;
@@ -851,7 +856,7 @@ next_slot:
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
- ret = btrfs_inc_extent_ref(trans, root,
+ ret = btrfs_inc_extent_ref(trans, fs_info,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
@@ -861,6 +866,12 @@ next_slot:
key.offset = start;
}
/*
+ * From here on out we will have actually dropped something, so
+ * last_end can be updated.
+ */
+ last_end = extent_end;
+
+ /*
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
@@ -872,7 +883,7 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -927,9 +938,9 @@ delete_extent_item:
inode_sub_bytes(inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
- root->sectorsize);
+ fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- ret = btrfs_free_extent(trans, root,
+ ret = btrfs_free_extent(trans, fs_info,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
@@ -986,7 +997,7 @@ delete_extent_item:
if (!ret && replace_extent && leafs_visited == 1 &&
(path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
path->locks[0] == BTRFS_WRITE_LOCK) &&
- btrfs_leaf_free_space(root, leaf) >=
+ btrfs_leaf_free_space(fs_info, leaf) >=
sizeof(struct btrfs_item) + extent_item_size) {
key.objectid = ino;
@@ -1010,7 +1021,7 @@ delete_extent_item:
if (!replace_extent || !(*key_inserted))
btrfs_release_path(path);
if (drop_end)
- *drop_end = found ? min(end, extent_end) : end;
+ *drop_end = found ? min(end, last_end) : end;
return ret;
}
@@ -1073,6 +1084,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
@@ -1142,7 +1154,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -1176,7 +1188,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(root->fs_info, path, &new_key);
+ btrfs_set_item_key_safe(fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1222,8 +1234,8 @@ again:
extent_end - split);
btrfs_mark_buffer_dirty(leaf);
- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
- root->root_key.objectid,
+ ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes,
+ 0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1256,7 +1268,7 @@ again:
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
@@ -1276,7 +1288,7 @@ again:
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset);
if (ret) {
@@ -1409,15 +1421,16 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
- start_pos = round_down(pos, root->sectorsize);
+ start_pos = round_down(pos, fs_info->sectorsize);
last_pos = start_pos
- + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
+ + round_up(pos + write_bytes - start_pos,
+ fs_info->sectorsize) - 1;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1464,6 +1477,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
static noinline int check_can_nocow(struct inode *inode, loff_t pos,
size_t *write_bytes)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered;
u64 lockstart, lockend;
@@ -1474,8 +1488,9 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
if (!ret)
return -ENOSPC;
- lockstart = round_down(pos, root->sectorsize);
- lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
+ lockstart = round_down(pos, fs_info->sectorsize);
+ lockend = round_up(pos + *write_bytes,
+ fs_info->sectorsize) - 1;
while (1) {
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1509,6 +1524,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
loff_t pos)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_state *cached_state = NULL;
@@ -1555,9 +1571,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- sector_offset = pos & (root->sectorsize - 1);
+ sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
- root->sectorsize);
+ fs_info->sectorsize);
ret = btrfs_check_data_free_space(inode, pos, write_bytes);
if (ret < 0) {
@@ -1577,7 +1593,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
PAGE_SIZE);
reserve_bytes = round_up(write_bytes +
sector_offset,
- root->sectorsize);
+ fs_info->sectorsize);
} else {
break;
}
@@ -1621,12 +1637,10 @@ again:
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
- num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- reserve_bytes);
+ num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
- root->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
- dirty_sectors);
+ fs_info->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
/*
* if we have trouble faulting in the pages, fall
@@ -1654,11 +1668,9 @@ again:
* managed to copy.
*/
if (num_sectors > dirty_sectors) {
-
/* release everything except the sectors we dirtied */
release_bytes -= dirty_sectors <<
- root->fs_info->sb->s_blocksize_bits;
-
+ fs_info->sb->s_blocksize_bits;
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -1670,7 +1682,8 @@ again:
} else {
u64 __pos;
- __pos = round_down(pos, root->sectorsize) +
+ __pos = round_down(pos,
+ fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode, __pos,
release_bytes);
@@ -1678,12 +1691,11 @@ again:
}
release_bytes = round_up(copied + sector_offset,
- root->sectorsize);
+ fs_info->sectorsize);
if (copied > 0)
- ret = btrfs_dirty_pages(root, inode, pages,
- dirty_pages, pos, copied,
- NULL);
+ ret = btrfs_dirty_pages(inode, pages, dirty_pages,
+ pos, copied, NULL);
if (need_unlock)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state,
@@ -1698,8 +1710,10 @@ again:
btrfs_end_write_no_snapshoting(root);
if (only_release_metadata && copied > 0) {
- lockstart = round_down(pos, root->sectorsize);
- lockend = round_up(pos + copied, root->sectorsize) - 1;
+ lockstart = round_down(pos,
+ fs_info->sectorsize);
+ lockend = round_up(pos + copied,
+ fs_info->sectorsize) - 1;
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
@@ -1712,8 +1726,8 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root);
+ if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
@@ -1727,7 +1741,7 @@ again:
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
btrfs_delalloc_release_space(inode,
- round_down(pos, root->sectorsize),
+ round_down(pos, fs_info->sectorsize),
release_bytes);
}
}
@@ -1798,6 +1812,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start_pos;
u64 end_pos;
@@ -1829,7 +1844,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* although we have opened a file as writable, we have
* to stop this write operation to ensure FS consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
inode_unlock(inode);
err = -EROFS;
goto out;
@@ -1845,17 +1860,18 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
- start_pos = round_down(pos, root->sectorsize);
+ start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
- end_pos = round_up(pos + count, root->sectorsize);
+ end_pos = round_up(pos + count,
+ fs_info->sectorsize);
err = btrfs_cont_expand(inode, oldsize, end_pos);
if (err) {
inode_unlock(inode);
goto out;
}
- if (start_pos > round_up(oldsize, root->sectorsize))
+ if (start_pos > round_up(oldsize, fs_info->sectorsize))
clean_page = 1;
}
@@ -1935,6 +1951,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
@@ -2045,12 +2062,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* commit does not start nor waits for ordered extents to complete.
*/
smp_mb();
- if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+ if (btrfs_inode_in_log(inode, fs_info->generation) ||
(full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed) ||
+ fs_info->last_trans_committed) ||
(!btrfs_have_ordered_extents_in_range(inode, start, len) &&
BTRFS_I(inode)->last_trans
- <= root->fs_info->last_trans_committed)) {
+ <= fs_info->last_trans_committed)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2129,7 +2146,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* which are indicated by ctx.io_err.
*/
if (ctx.io_err) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
ret = ctx.io_err;
goto out;
}
@@ -2138,20 +2155,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
goto out;
}
}
if (!full_sync) {
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
} else {
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
}
out:
return ret > 0 ? -EIO : ret;
@@ -2208,6 +2225,7 @@ static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
struct btrfs_path *path, u64 offset, u64 end)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
@@ -2216,7 +2234,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
struct btrfs_key key;
int ret;
- if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
goto out;
key.objectid = btrfs_ino(inode);
@@ -2224,9 +2242,15 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0)
+ if (ret <= 0) {
+ /*
+ * We should have dropped this offset, so if we find it then
+ * something has gone horribly wrong.
+ */
+ if (ret == 0)
+ ret = -EINVAL;
return ret;
- BUG_ON(!ret);
+ }
leaf = path->nodes[0];
if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
@@ -2248,7 +2272,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(root->fs_info, path, &key);
+ btrfs_set_item_key_safe(fs_info, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2284,7 +2308,7 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -2336,6 +2360,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
@@ -2347,13 +2372,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 tail_len;
u64 orig_start = offset;
u64 cur_offset;
- u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
u64 drop_end;
int ret = 0;
int err = 0;
unsigned int rsv_count;
bool same_block;
- bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
u64 ino_size;
bool truncated_block = false;
bool updated_inode = false;
@@ -2363,7 +2388,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
inode_lock(inode);
- ino_size = round_up(inode->i_size, root->sectorsize);
+ ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(inode, &offset, &len);
if (ret < 0)
goto out_only_mutex;
@@ -2373,11 +2398,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
lockend = round_down(offset + len,
- BTRFS_I(inode)->root->sectorsize) - 1;
- same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
- == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
+ btrfs_inode_sectorsize(inode)) - 1;
+ same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
+ == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
* We needn't truncate any block which is beyond the end of the file
* because we are sure there is no data there.
@@ -2386,7 +2411,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* Only do this if we are in the same block and we aren't doing the
* entire block.
*/
- if (same_block && len < root->sectorsize) {
+ if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(inode, offset, len, 0);
@@ -2489,12 +2514,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
ret = -ENOMEM;
goto out_free;
}
- rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
rsv->failfast = 1;
/*
@@ -2509,7 +2534,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_free;
}
- ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, 0);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2523,12 +2548,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret != -ENOSPC)
break;
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
- if (cur_offset < ino_size) {
+ if (cur_offset < drop_end && cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_end);
if (ret) {
+ /*
+ * If we failed then we didn't insert our hole
+ * entries for the area we dropped, so now the
+ * fs is corrupted, so we must abort the
+ * transaction.
+ */
+ btrfs_abort_transaction(trans, ret);
err = ret;
break;
}
@@ -2542,8 +2574,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
break;
}
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, rsv_count);
if (IS_ERR(trans)) {
@@ -2552,7 +2584,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
break;
}
- ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
@@ -2571,7 +2603,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_trans;
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
/*
* If we are using the NO_HOLES feature we might have had already an
* hole that overlaps a part of the region [lockstart, lockend] and
@@ -2593,6 +2625,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, inode, path, cur_offset, drop_end);
if (ret) {
+ /* Same comment as above. */
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_trans;
}
@@ -2605,14 +2639,14 @@ out_trans:
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
updated_inode = true;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
out_free:
btrfs_free_path(path);
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
@@ -2630,7 +2664,7 @@ out_only_mutex:
err = PTR_ERR(trans);
} else {
err = btrfs_update_inode(trans, root, inode);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
}
}
inode_unlock(inode);
@@ -2695,7 +2729,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = BTRFS_I(inode)->root->sectorsize;
+ int blocksize = btrfs_inode_sectorsize(inode);
int ret;
alloc_start = round_down(offset, blocksize);
@@ -2872,9 +2906,9 @@ static long btrfs_fallocate(struct file *file, int mode,
btrfs_ordered_update_i_size(inode, actual_end, NULL);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
else
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
}
}
out_unlock:
@@ -2891,7 +2925,7 @@ out:
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 lockstart;
@@ -2909,10 +2943,11 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
*/
start = max_t(loff_t, 0, *offset);
- lockstart = round_down(start, root->sectorsize);
- lockend = round_up(i_size_read(inode), root->sectorsize);
+ lockstart = round_down(start, fs_info->sectorsize);
+ lockend = round_up(i_size_read(inode),
+ fs_info->sectorsize);
if (lockend <= lockstart)
- lockend = lockstart + root->sectorsize;
+ lockend = lockstart + fs_info->sectorsize;
lockend--;
len = lockend - lockstart + 1;
@@ -2998,7 +3033,6 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
- .copy_file_range = btrfs_copy_file_range,
.clone_file_range = btrfs_clone_file_range,
.dedupe_file_range = btrfs_dedupe_file_range,
};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e4b48f377d3a..7015892c9ee8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -42,11 +42,16 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
+static int btrfs_wait_cache_io_root(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
@@ -74,9 +79,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
- inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
- if (!inode)
- return ERR_PTR(-ENOENT);
+ inode = btrfs_iget(fs_info->sb, &location, root, NULL);
if (IS_ERR(inode))
return inode;
if (is_bad_inode(inode)) {
@@ -96,6 +99,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
*block_group, struct btrfs_path *path)
{
struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
spin_lock(&block_group->lock);
@@ -112,8 +116,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
spin_lock(&block_group->lock);
if (!((BTRFS_I(inode)->flags & flags) == flags)) {
- btrfs_info(root->fs_info,
- "Old style space inode found, converting.");
+ btrfs_info(fs_info, "Old style space inode found, converting.");
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
BTRFS_INODE_NODATACOW;
block_group->disk_cache_state = BTRFS_DC_CLEAR;
@@ -153,7 +156,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
btrfs_item_key(leaf, &disk_key, path->slots[0]);
- memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
+ memzero_extent_buffer(leaf, (unsigned long)inode_item,
sizeof(*inode_item));
btrfs_set_inode_generation(leaf, inode_item, trans->transid);
btrfs_set_inode_size(leaf, inode_item, 0);
@@ -181,7 +184,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
- memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
+ memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -205,15 +208,15 @@ int create_free_space_inode(struct btrfs_root *root,
block_group->key.objectid);
}
-int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
u64 needed_bytes;
int ret;
/* 1 for slack space, 1 for updating the inode */
- needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
- btrfs_calc_trans_metadata_size(root, 1);
+ needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
+ btrfs_calc_trans_metadata_size(fs_info, 1);
spin_lock(&rsv->lock);
if (rsv->reserved < needed_bytes)
@@ -244,9 +247,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
- btrfs_wait_cache_io(root, trans, block_group,
- &block_group->io_ctl, path,
- block_group->key.objectid);
+ btrfs_wait_cache_io(trans, block_group, path);
btrfs_put_block_group(block_group);
}
@@ -305,7 +306,7 @@ static int readahead_cache(struct inode *inode)
}
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root, int write)
+ int write)
{
int num_pages;
int check_crcs = 0;
@@ -327,7 +328,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
return -ENOMEM;
io_ctl->num_pages = num_pages;
- io_ctl->root = root;
+ io_ctl->fs_info = btrfs_sb(inode->i_sb);
io_ctl->check_crcs = check_crcs;
io_ctl->inode = inode;
@@ -450,7 +451,7 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- btrfs_err_rl(io_ctl->root->fs_info,
+ btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
*gen, generation);
io_ctl_unmap_page(io_ctl);
@@ -476,7 +477,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_SIZE - offset);
- btrfs_csum_final(crc, (char *)&crc);
+ btrfs_csum_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
tmp += index;
@@ -504,9 +505,9 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_SIZE - offset);
- btrfs_csum_final(crc, (char *)&crc);
+ btrfs_csum_final(crc, (u8 *)&crc);
if (val != crc) {
- btrfs_err_rl(io_ctl->root->fs_info,
+ btrfs_err_rl(io_ctl->fs_info,
"csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
@@ -669,6 +670,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct btrfs_io_ctl io_ctl;
@@ -708,23 +710,23 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
btrfs_release_path(path);
if (!BTRFS_I(inode)->generation) {
- btrfs_info(root->fs_info,
+ btrfs_info(fs_info,
"The free space cache file (%llu) is invalid. skip it\n",
offset);
return 0;
}
if (BTRFS_I(inode)->generation != generation) {
- btrfs_err(root->fs_info,
- "free space inode generation (%llu) did not match free space cache generation (%llu)",
- BTRFS_I(inode)->generation, generation);
+ btrfs_err(fs_info,
+ "free space inode generation (%llu) did not match free space cache generation (%llu)",
+ BTRFS_I(inode)->generation, generation);
return 0;
}
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root, 0);
+ ret = io_ctl_init(&io_ctl, inode, 0);
if (ret)
return ret;
@@ -766,7 +768,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = link_free_space(ctl, e);
spin_unlock(&ctl->tree_lock);
if (ret) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
@@ -786,7 +788,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ctl->op->recalc_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
@@ -1033,7 +1035,7 @@ fail:
}
static noinline_for_stack int
-write_pinned_extent_entries(struct btrfs_root *root,
+write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
@@ -1052,7 +1054,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
- unpin = root->fs_info->pinned_extents;
+ unpin = fs_info->pinned_extents;
start = block_group->key.objectid;
@@ -1135,20 +1137,20 @@ cleanup_write_cache_enospc(struct inode *inode,
GFP_NOFS);
}
-int btrfs_wait_cache_io(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path, u64 offset)
+static int __btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset)
{
int ret;
struct inode *inode = io_ctl->inode;
+ struct btrfs_fs_info *fs_info;
if (!inode)
return 0;
- if (block_group)
- root = root->fs_info->tree_root;
+ fs_info = btrfs_sb(inode->i_sb);
/* Flush the dirty pages in the cache file. */
ret = flush_dirty_cache(inode);
@@ -1165,9 +1167,9 @@ out:
BTRFS_I(inode)->generation = 0;
if (block_group) {
#ifdef DEBUG
- btrfs_err(root->fs_info,
- "failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ btrfs_err(fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
#endif
}
}
@@ -1200,6 +1202,23 @@ out:
}
+static int btrfs_wait_cache_io_root(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path)
+{
+ return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
+}
+
+int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
+ block_group, &block_group->io_ctl,
+ path, block_group->key.objectid);
+}
+
/**
* __btrfs_write_out_cache - write out cached info to an inode
* @root - the root the inode belongs to
@@ -1220,6 +1239,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 offset)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_state *cached_state = NULL;
LIST_HEAD(bitmap_list);
int entries = 0;
@@ -1231,7 +1251,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
return -EIO;
WARN_ON(io_ctl->pages);
- ret = io_ctl_init(io_ctl, inode, root, 1);
+ ret = io_ctl_init(io_ctl, inode, 1);
if (ret)
return ret;
@@ -1277,7 +1297,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
+ ret = write_pinned_extent_entries(fs_info, block_group,
+ io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@@ -1296,8 +1317,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
- 0, i_size_read(inode), &cached_state);
+ ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
+ i_size_read(inode), &cached_state);
if (ret)
goto out_nospc;
@@ -1352,17 +1373,16 @@ out_nospc:
goto out;
}
-int btrfs_write_out_cache(struct btrfs_root *root,
+int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path)
{
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
- root = root->fs_info->tree_root;
-
spin_lock(&block_group->lock);
if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
spin_unlock(&block_group->lock);
@@ -1379,9 +1399,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
path, block_group->key.objectid);
if (ret) {
#ifdef DEBUG
- btrfs_err(root->fs_info,
- "failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ btrfs_err(fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
#endif
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
@@ -1968,11 +1988,11 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
- block_group))
+ if (btrfs_should_fragment_free_space(block_group))
forced = true;
#endif
@@ -1988,7 +2008,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* of cache left then go ahead an dadd them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
- if (info->bytes <= block_group->sectorsize * 4) {
+ if (info->bytes <= fs_info->sectorsize * 4) {
if (ctl->free_extents * 2 <= ctl->extents_thresh)
return false;
} else {
@@ -2447,6 +2467,7 @@ out:
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct rb_node *n;
@@ -2456,23 +2477,23 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
count++;
- btrfs_crit(block_group->fs_info,
- "entry offset %llu, bytes %llu, bitmap %s",
+ btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
- btrfs_info(block_group->fs_info, "block group has cluster?: %s",
+ btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
- btrfs_info(block_group->fs_info,
+ btrfs_info(fs_info,
"%d blocks of free space at or bigger than bytes is", count);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
- ctl->unit = block_group->sectorsize;
+ ctl->unit = fs_info->sectorsize;
ctl->start = block_group->key.objectid;
ctl->private = block_group;
ctl->op = &free_space_op;
@@ -3011,7 +3032,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
-int btrfs_find_space_cluster(struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
@@ -3029,14 +3050,14 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
* For metadata, allow allocates with smaller extents. For
* data, keep it dense.
*/
- if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) {
+ if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
- min_bytes = block_group->sectorsize;
+ min_bytes = fs_info->sectorsize;
} else {
cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
- min_bytes = block_group->sectorsize;
+ min_bytes = fs_info->sectorsize;
}
spin_lock(&ctl->tree_lock);
@@ -3124,8 +3145,7 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- ret = btrfs_discard_extent(fs_info->extent_root,
- start, bytes, &trimmed);
+ ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
if (!ret)
*total_trimmed += trimmed;
@@ -3321,6 +3341,7 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_map_tree *em_tree;
struct extent_map *em;
bool cleanup;
@@ -3331,8 +3352,8 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- lock_chunks(block_group->fs_info->chunk_root);
- em_tree = &block_group->fs_info->mapping_tree.map_tree;
+ mutex_lock(&fs_info->chunk_mutex);
+ em_tree = &fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->key.objectid,
1);
@@ -3343,7 +3364,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
*/
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- unlock_chunks(block_group->fs_info->chunk_root);
+ mutex_unlock(&fs_info->chunk_mutex);
/* once for us and once for the tree */
free_extent_map(em);
@@ -3473,7 +3494,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
int ret = 0;
u64 root_gen = btrfs_root_generation(&root->root_item);
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
/*
@@ -3512,12 +3533,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_path *path,
struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
int ret;
struct btrfs_io_ctl io_ctl;
bool release_metadata = true;
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
memset(&io_ctl, 0, sizeof(io_ctl));
@@ -3531,16 +3553,16 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
* with or without an error.
*/
release_metadata = false;
- ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+ ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
}
if (ret) {
if (release_metadata)
btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG
- btrfs_err(root->fs_info,
- "failed to write free ino cache for root %llu",
- root->root_key.objectid);
+ btrfs_err(fs_info,
+ "failed to write free ino cache for root %llu",
+ root->root_key.objectid);
#endif
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 363fdd955e5d..6f3c025a2c6c 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -59,7 +59,7 @@ int create_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
-int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
@@ -67,12 +67,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
-int btrfs_wait_cache_io(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
+int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path, u64 offset);
-int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_path *path);
+int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
@@ -111,7 +109,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 57401b474ec6..ff0c55337c2e 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -39,7 +39,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
*/
- bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
bitmap_range);
bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
@@ -189,7 +189,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int ret;
bitmap_size = free_space_bitmap_size(block_group->key.offset,
- block_group->sectorsize);
+ fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -227,9 +227,9 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ASSERT(found_key.objectid + found_key.offset <= end);
first = div_u64(found_key.objectid - start,
- block_group->sectorsize);
+ fs_info->sectorsize);
last = div_u64(found_key.objectid + found_key.offset - start,
- block_group->sectorsize);
+ fs_info->sectorsize);
le_bitmap_set(bitmap, first, last - first);
extent_count++;
@@ -270,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
bitmap_cursor = bitmap;
- bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
i = start;
while (i < end) {
unsigned long ptr;
@@ -279,7 +279,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
extent_size = min(end - i, bitmap_range);
data_size = free_space_bitmap_size(extent_size,
- block_group->sectorsize);
+ fs_info->sectorsize);
key.objectid = i;
key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
@@ -330,7 +330,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int ret;
bitmap_size = free_space_bitmap_size(block_group->key.offset,
- block_group->sectorsize);
+ fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -370,11 +370,11 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
ASSERT(found_key.objectid + found_key.offset <= end);
bitmap_pos = div_u64(found_key.objectid - start,
- block_group->sectorsize *
+ fs_info->sectorsize *
BITS_PER_BYTE);
bitmap_cursor = bitmap + bitmap_pos;
data_size = free_space_bitmap_size(found_key.offset,
- block_group->sectorsize);
+ fs_info->sectorsize);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
read_extent_buffer(leaf, bitmap_cursor, ptr,
@@ -425,7 +425,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
extent_count++;
}
prev_bit = bit;
- offset += block_group->sectorsize;
+ offset += fs_info->sectorsize;
bitnr++;
}
if (prev_bit == 1) {
@@ -517,7 +517,8 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group,
ASSERT(offset >= found_start && offset < found_end);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- i = div_u64(offset - found_start, block_group->sectorsize);
+ i = div_u64(offset - found_start,
+ block_group->fs_info->sectorsize);
return !!extent_buffer_test_bit(leaf, ptr, i);
}
@@ -525,6 +526,7 @@ static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 end = *start + *size;
@@ -544,8 +546,8 @@ static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
end = found_end;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- first = div_u64(*start - found_start, block_group->sectorsize);
- last = div_u64(end - found_start, block_group->sectorsize);
+ first = div_u64(*start - found_start, fs_info->sectorsize);
+ last = div_u64(end - found_start, fs_info->sectorsize);
if (bit)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
@@ -606,7 +608,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* that block is within the block group.
*/
if (start > block_group->key.objectid) {
- u64 prev_block = start - block_group->sectorsize;
+ u64 prev_block = start - block_group->fs_info->sectorsize;
key.objectid = prev_block;
key.type = (u8)-1;
@@ -1121,7 +1123,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
}
start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- start += fs_info->tree_root->nodesize;
+ start += fs_info->nodesize;
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
@@ -1187,7 +1189,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
- ret = btrfs_commit_transaction(trans, tree_root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
@@ -1196,7 +1198,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -1267,7 +1269,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
- clean_tree_block(trans, tree_root->fs_info, free_space_root->node);
+ clean_tree_block(trans, fs_info, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
0, 1);
@@ -1276,7 +1278,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
free_extent_buffer(free_space_root->commit_root);
kfree(free_space_root);
- ret = btrfs_commit_transaction(trans, tree_root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
@@ -1284,7 +1286,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
abort:
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -1473,7 +1475,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
extent_count++;
}
prev_bit = bit;
- offset += block_group->sectorsize;
+ offset += fs_info->sectorsize;
}
}
if (prev_bit == 1) {
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b8acc07ac6c2..39c968f80157 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -182,7 +182,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(root, path, item_size - del_len, 1);
+ btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -245,7 +245,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -297,7 +297,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
name, name_len, NULL))
goto out;
- btrfs_extend_item(root, path, ins_len);
+ btrfs_extend_item(root->fs_info, path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -328,6 +328,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 index)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_inode_ref *ref;
@@ -354,7 +355,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- btrfs_extend_item(root, path, ins_len);
+ btrfs_extend_item(fs_info, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -384,7 +385,7 @@ out:
btrfs_free_path(path);
if (ret == -EMLINK) {
- struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
/* We ran out of space in the ref array. Need to
* add an extended ref. */
if (btrfs_super_incompat_flags(disk_super)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d27014b8bf72..144b119ff43f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,7 +38,7 @@ static int caching_kthread(void *data)
int slot;
int ret;
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -180,7 +180,7 @@ static void start_caching(struct btrfs_root *root)
if (IS_ERR(tsk)) {
btrfs_warn(fs_info, "failed to start inode caching task");
btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
- "disabling inode map caching");
+ "disabling inode map caching");
}
}
@@ -395,6 +395,7 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
@@ -415,7 +416,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (btrfs_root_refs(&root->root_item) == 0)
return 0;
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -423,7 +424,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
return -ENOMEM;
rsv = trans->block_rsv;
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
num_bytes = trans->bytes_reserved;
/*
@@ -433,14 +434,14 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
* 1 item for free space object
* 3 items for pre-allocation
*/
- trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
+ trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
if (ret)
goto out;
- trace_btrfs_space_reservation(root->fs_info, "ino_cache",
- trans->transid, trans->bytes_reserved, 1);
+ trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
+ trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
@@ -506,9 +507,10 @@ again:
out_put:
iput(inode);
out_release:
- trace_btrfs_space_reservation(root->fs_info, "ino_cache",
- trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+ trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
+ trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ trans->bytes_reserved);
out:
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e3a5a266917..1e861a063721 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -30,7 +30,6 @@
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
@@ -250,11 +249,12 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
int compress_type,
struct page **compressed_pages)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 isize = i_size_read(inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, root->sectorsize);
+ u64 aligned_end = ALIGN(end, fs_info->sectorsize);
u64 data_len = inline_len;
int ret;
struct btrfs_path *path;
@@ -265,12 +265,12 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
data_len = compressed_size;
if (start > 0 ||
- actual_end > root->sectorsize ||
- data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+ actual_end > fs_info->sectorsize ||
+ data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
(!compressed_size &&
- (actual_end & (root->sectorsize - 1)) == 0) ||
+ (actual_end & (fs_info->sectorsize - 1)) == 0) ||
end + 1 < isize ||
- data_len > root->fs_info->max_inline) {
+ data_len > fs_info->max_inline) {
return 1;
}
@@ -283,7 +283,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ trans->block_rsv = &fs_info->delalloc_block_rsv;
if (compressed_size && compressed_pages)
extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -326,7 +326,7 @@ out:
*/
btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
btrfs_free_path(path);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -373,15 +373,15 @@ static noinline int add_async_extent(struct async_cow *cow,
static inline int inode_need_compress(struct inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
/* force compress */
- if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
+ if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* bad compression ratios */
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
- if (btrfs_test_opt(root->fs_info, COMPRESS) ||
+ if (btrfs_test_opt(fs_info, COMPRESS) ||
BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
BTRFS_I(inode)->force_compress)
return 1;
@@ -411,9 +411,10 @@ static noinline void compress_file_range(struct inode *inode,
struct async_cow *async_cow,
int *num_added)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 num_bytes;
- u64 blocksize = root->sectorsize;
+ u64 blocksize = fs_info->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
int ret = 0;
@@ -426,7 +427,7 @@ static noinline void compress_file_range(struct inode *inode,
unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
- int compress_type = root->fs_info->compress_type;
+ int compress_type = fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
@@ -625,7 +626,7 @@ cont:
nr_pages_ret = 0;
/* flag the file so we don't compress in the future */
- if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
+ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
!(BTRFS_I(inode)->force_compress)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
}
@@ -683,6 +684,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
static noinline void submit_compressed_extents(struct inode *inode,
struct async_cow *async_cow)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
@@ -795,7 +797,7 @@ retry:
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = async_extent->ram_size;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -830,7 +832,7 @@ retry:
async_extent->ram_size - 1, 0);
goto out_free_reserve;
}
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
/*
* clear dirty, set writeback and unlock the pages.
@@ -871,8 +873,8 @@ retry:
}
return;
out_free_reserve:
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -940,13 +942,14 @@ static noinline int cow_file_range(struct inode *inode,
int *page_started, unsigned long *nr_written,
int unlock, struct btrfs_dedupe_hash *hash)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
u64 disk_num_bytes;
u64 cur_alloc_size;
- u64 blocksize = root->sectorsize;
+ u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -990,7 +993,7 @@ static noinline int cow_file_range(struct inode *inode,
}
BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(root->fs_info->super_copy));
+ btrfs_super_total_bytes(fs_info->super_copy));
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1000,7 +1003,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- root->sectorsize, 0, alloc_hint,
+ fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1021,7 +1024,7 @@ static noinline int cow_file_range(struct inode *inode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ram_size;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
em->generation = -1;
@@ -1053,7 +1056,7 @@ static noinline int cow_file_range(struct inode *inode,
goto out_drop_extent_cache;
}
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (disk_num_bytes < cur_alloc_size)
break;
@@ -1084,8 +1087,8 @@ out:
out_drop_extent_cache:
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
locked_page,
@@ -1119,6 +1122,7 @@ static noinline void async_cow_start(struct btrfs_work *work)
*/
static noinline void async_cow_submit(struct btrfs_work *work)
{
+ struct btrfs_fs_info *fs_info;
struct async_cow *async_cow;
struct btrfs_root *root;
unsigned long nr_pages;
@@ -1126,16 +1130,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
async_cow = container_of(work, struct async_cow, work);
root = async_cow->root;
+ fs_info = root->fs_info;
nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
PAGE_SHIFT;
/*
* atomic_sub_return implies a barrier for waitqueue_active
*/
- if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
5 * SZ_1M &&
- waitqueue_active(&root->fs_info->async_submit_wait))
- wake_up(&root->fs_info->async_submit_wait);
+ waitqueue_active(&fs_info->async_submit_wait))
+ wake_up(&fs_info->async_submit_wait);
if (async_cow->inode)
submit_compressed_extents(async_cow->inode, async_cow);
@@ -1154,6 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_cow *async_cow;
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
@@ -1171,7 +1177,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->start = start;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
- !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
+ !btrfs_test_opt(fs_info, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + SZ_512K - 1);
@@ -1186,22 +1192,21 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
nr_pages = (cur_end - start + PAGE_SIZE) >>
PAGE_SHIFT;
- atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+ atomic_add(nr_pages, &fs_info->async_delalloc_pages);
- btrfs_queue_work(root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
- if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
- wait_event(root->fs_info->async_submit_wait,
- (atomic_read(&root->fs_info->async_delalloc_pages) <
- limit));
+ if (atomic_read(&fs_info->async_delalloc_pages) > limit) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->async_delalloc_pages) <
+ limit));
}
- while (atomic_read(&root->fs_info->async_submit_draining) &&
- atomic_read(&root->fs_info->async_delalloc_pages)) {
- wait_event(root->fs_info->async_submit_wait,
- (atomic_read(&root->fs_info->async_delalloc_pages) ==
- 0));
+ while (atomic_read(&fs_info->async_submit_draining) &&
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->async_delalloc_pages) ==
+ 0));
}
*nr_written += nr_pages;
@@ -1211,14 +1216,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
return 0;
}
-static noinline int csum_exist_in_range(struct btrfs_root *root,
+static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
int ret;
struct btrfs_ordered_sum *sums;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+ ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1243,6 +1248,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 start, u64 end, int *page_started, int force,
unsigned long *nr_written)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct extent_buffer *leaf;
@@ -1298,7 +1304,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
return PTR_ERR(trans);
}
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ trans->block_rsv = &fs_info->delalloc_block_rsv;
cow_start = (u64)-1;
cur_offset = start;
@@ -1374,7 +1380,7 @@ next_slot:
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
- if (btrfs_extent_readonly(root, disk_bytenr))
+ if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
if (btrfs_cross_ref_exist(trans, root, ino,
found_key.offset -
@@ -1397,17 +1403,18 @@ next_slot:
* this ensure that csum for a given extent are
* either valid or do not exist.
*/
- if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ if (csum_exist_in_range(fs_info, disk_bytenr,
+ num_bytes))
goto out_check;
- if (!btrfs_inc_nocow_writers(root->fs_info,
- disk_bytenr))
+ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
- extent_end = ALIGN(extent_end, root->sectorsize);
+ extent_end = ALIGN(extent_end,
+ fs_info->sectorsize);
} else {
BUG_ON(1);
}
@@ -1417,8 +1424,7 @@ out_check:
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
if (nocow)
- btrfs_dec_nocow_writers(root->fs_info,
- disk_bytenr);
+ btrfs_dec_nocow_writers(fs_info, disk_bytenr);
goto next_slot;
}
if (!nocow) {
@@ -1441,7 +1447,7 @@ out_check:
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
if (nocow)
- btrfs_dec_nocow_writers(root->fs_info,
+ btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
goto error;
}
@@ -1461,7 +1467,7 @@ out_check:
em->block_start = disk_bytenr;
em->orig_block_len = disk_num_bytes;
em->ram_bytes = ram_bytes;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
em->mod_start = em->start;
em->mod_len = em->len;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -1486,7 +1492,7 @@ out_check:
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
num_bytes, num_bytes, type);
if (nocow)
- btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
+ btrfs_dec_nocow_writers(fs_info, disk_bytenr);
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
@@ -1528,7 +1534,7 @@ out_check:
}
error:
- err = btrfs_end_transaction(trans, root);
+ err = btrfs_end_transaction(trans);
if (!ret)
ret = err;
@@ -1693,6 +1699,8 @@ static void btrfs_merge_extent_hook(struct inode *inode,
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
spin_lock(&root->delalloc_lock);
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
@@ -1701,11 +1709,11 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
&BTRFS_I(inode)->runtime_flags);
root->nr_delalloc_inodes++;
if (root->nr_delalloc_inodes == 1) {
- spin_lock(&root->fs_info->delalloc_root_lock);
+ spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(!list_empty(&root->delalloc_root));
list_add_tail(&root->delalloc_root,
- &root->fs_info->delalloc_roots);
- spin_unlock(&root->fs_info->delalloc_root_lock);
+ &fs_info->delalloc_roots);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
}
spin_unlock(&root->delalloc_lock);
@@ -1714,6 +1722,8 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
spin_lock(&root->delalloc_lock);
if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1721,10 +1731,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
&BTRFS_I(inode)->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
- spin_lock(&root->fs_info->delalloc_root_lock);
+ spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
- spin_unlock(&root->fs_info->delalloc_root_lock);
+ spin_unlock(&fs_info->delalloc_root_lock);
}
}
spin_unlock(&root->delalloc_lock);
@@ -1739,6 +1749,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
struct extent_state *state, unsigned *bits)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
@@ -1760,11 +1772,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
}
/* For sanity tests */
- if (btrfs_is_testing(root->fs_info))
+ if (btrfs_is_testing(fs_info))
return;
- __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
- root->fs_info->delalloc_batch);
+ __percpu_counter_add(&fs_info->delalloc_bytes, len,
+ fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (*bits & EXTENT_DEFRAG)
@@ -1783,6 +1795,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
unsigned *bits)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 len = state->end + 1 - state->start;
u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
BTRFS_MAX_EXTENT_SIZE);
@@ -1815,11 +1828,11 @@ static void btrfs_clear_bit_hook(struct inode *inode,
* error.
*/
if (*bits & EXTENT_DO_ACCOUNTING &&
- root != root->fs_info->tree_root)
+ root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
/* For sanity tests. */
- if (btrfs_is_testing(root->fs_info))
+ if (btrfs_is_testing(fs_info))
return;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1829,8 +1842,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_free_reserved_data_space_noquota(inode,
state->start, len);
- __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
- root->fs_info->delalloc_batch);
+ __percpu_counter_add(&fs_info->delalloc_bytes, -len,
+ fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes -= len;
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
@@ -1853,7 +1866,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags)
{
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
@@ -1864,8 +1878,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
- &map_length, NULL, 0);
+ ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ NULL, 0);
if (ret < 0)
return ret;
if (map_length < length + size)
@@ -1885,10 +1899,9 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
- ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(inode, bio, 0, 0);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -1905,10 +1918,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
- ret = btrfs_map_bio(root, bio, mirror_num, 1);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
if (ret) {
bio->bi_error = ret;
bio_endio(bio);
@@ -1924,6 +1937,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
int ret = 0;
@@ -1936,7 +1950,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (bio_op(bio) != REQ_OP_WRITE) {
- ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
@@ -1946,7 +1960,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
bio_flags);
goto out;
} else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret)
goto out;
}
@@ -1956,20 +1970,19 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
- inode, bio, mirror_num,
- bio_flags, bio_offset,
- __btrfs_submit_bio_start,
- __btrfs_submit_bio_done);
+ ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
+ bio_flags, bio_offset,
+ __btrfs_submit_bio_start,
+ __btrfs_submit_bio_done);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+ ret = btrfs_csum_one_bio(inode, bio, 0, 0);
if (ret)
goto out;
}
mapit:
- ret = btrfs_map_bio(root, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
out:
if (ret < 0) {
@@ -2090,8 +2103,8 @@ out_page:
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- struct btrfs_root *root = BTRFS_I(inode)->root;
/* this page is properly in the ordered list */
if (TestClearPagePrivate2(page))
@@ -2109,7 +2122,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
btrfs_init_work(&fixup->work, btrfs_fixup_helper,
btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
+ btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EBUSY;
}
@@ -2180,10 +2193,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_alloc_reserved_file_extent(trans, root,
- root->root_key.objectid,
- btrfs_ino(inode), file_pos,
- ram_bytes, &ins);
+ ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
+ btrfs_ino(inode), file_pos,
+ ram_bytes, &ins);
/*
* Release the reserved range from inode dirty range map, as it is
* already moved into delayed_ref_head
@@ -2293,7 +2305,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
void *ctx)
{
struct btrfs_file_extent_item *extent;
- struct btrfs_fs_info *fs_info;
struct old_sa_defrag_extent *old = ctx;
struct new_sa_defrag_extent *new = old->new;
struct btrfs_path *path = new->path;
@@ -2302,6 +2313,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
struct sa_defrag_extent_backref *backref;
struct extent_buffer *leaf;
struct inode *inode = new->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int slot;
int ret;
u64 extent_offset;
@@ -2315,7 +2327,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- fs_info = BTRFS_I(inode)->root->fs_info;
root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(root)) {
if (PTR_ERR(root) == -ENOENT)
@@ -2413,7 +2424,7 @@ out:
static noinline bool record_extent_backrefs(struct btrfs_path *path,
struct new_sa_defrag_extent *new)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
struct old_sa_defrag_extent *old, *tmp;
int ret;
@@ -2471,13 +2482,12 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
struct btrfs_file_extent_item *item;
struct btrfs_ordered_extent *ordered;
struct btrfs_trans_handle *trans;
- struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
struct extent_buffer *leaf;
struct old_sa_defrag_extent *old = backref->old;
struct new_sa_defrag_extent *new = old->new;
- struct inode *src_inode = new->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
struct inode *inode;
struct extent_state *cached = NULL;
int ret = 0;
@@ -2498,7 +2508,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- fs_info = BTRFS_I(src_inode)->root->fs_info;
index = srcu_read_lock(&fs_info->subvol_srcu);
root = btrfs_read_fs_root_no_name(fs_info, &key);
@@ -2643,7 +2652,7 @@ again:
inode_add_bytes(inode, len);
btrfs_release_path(path);
- ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+ ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
new->disk_len, 0,
backref->root_id, backref->inum,
new->file_pos); /* start - extent_offset */
@@ -2656,7 +2665,7 @@ again:
out_free_path:
btrfs_release_path(path);
path->leave_spinning = 0;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
&cached, GFP_NOFS);
@@ -2679,6 +2688,7 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
static void relink_file_extents(struct new_sa_defrag_extent *new)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
struct btrfs_path *path;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
@@ -2725,14 +2735,15 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
out:
free_sa_defrag_extent(new);
- atomic_dec(&root->fs_info->defrag_running);
- wake_up(&root->fs_info->transaction_wait);
+ atomic_dec(&fs_info->defrag_running);
+ wake_up(&fs_info->transaction_wait);
}
static struct new_sa_defrag_extent *
record_old_file_extents(struct inode *inode,
struct btrfs_ordered_extent *ordered)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2831,7 +2842,7 @@ next:
}
btrfs_free_path(path);
- atomic_inc(&root->fs_info->defrag_running);
+ atomic_inc(&fs_info->defrag_running);
return new;
@@ -2842,12 +2853,12 @@ out_kfree:
return NULL;
}
-static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
struct btrfs_block_group_cache *cache;
- cache = btrfs_lookup_block_group(root->fs_info, start);
+ cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
spin_lock(&cache->lock);
@@ -2864,6 +2875,7 @@ static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct inode *inode = ordered_extent->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -2914,7 +2926,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out;
}
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ trans->block_rsv = &fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@@ -2949,7 +2961,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out_unlock;
}
- trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+ trans->block_rsv = &fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -2960,7 +2972,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset +
logical_len);
} else {
- BUG_ON(root == root->fs_info->tree_root);
+ BUG_ON(root == fs_info->tree_root);
ret = insert_reserved_file_extent(trans, inode,
ordered_extent->file_offset,
ordered_extent->start,
@@ -2969,7 +2981,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
if (!ret)
- btrfs_release_delalloc_bytes(root,
+ btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
}
@@ -2996,10 +3008,10 @@ out_unlock:
ordered_extent->file_offset +
ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
- if (root != root->fs_info->tree_root)
+ if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
if (trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret || truncated) {
u64 start, end;
@@ -3023,7 +3035,8 @@ out:
if ((ret || !logical_len) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
- btrfs_free_reserved_extent(root, ordered_extent->start,
+ btrfs_free_reserved_extent(fs_info,
+ ordered_extent->start,
ordered_extent->disk_len, 1);
}
@@ -3038,7 +3051,7 @@ out:
if (new) {
if (ret) {
free_sa_defrag_extent(new);
- atomic_dec(&root->fs_info->defrag_running);
+ atomic_dec(&fs_info->defrag_running);
} else {
relink_file_extents(new);
}
@@ -3063,7 +3076,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
struct inode *inode = page->mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
btrfs_work_func_t func;
@@ -3076,10 +3089,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
if (btrfs_is_free_space_inode(inode)) {
- wq = root->fs_info->endio_freespace_worker;
+ wq = fs_info->endio_freespace_worker;
func = btrfs_freespace_write_helper;
} else {
- wq = root->fs_info->endio_write_workers;
+ wq = fs_info->endio_write_workers;
func = btrfs_endio_write_helper;
}
@@ -3103,7 +3116,7 @@ static int __readpage_endio_check(struct inode *inode,
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(kaddr + pgoff, csum, len);
- btrfs_csum_final(csum, (char *)&csum);
+ btrfs_csum_final(csum, (u8 *)&csum);
if (csum != csum_expected)
goto zeroit;
@@ -3156,7 +3169,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
void btrfs_add_delayed_iput(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
@@ -3172,9 +3185,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
spin_unlock(&fs_info->delayed_iput_lock);
}
-void btrfs_run_delayed_iputs(struct btrfs_root *root)
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->delayed_iput_lock);
while (!list_empty(&fs_info->delayed_iputs)) {
@@ -3204,6 +3216,7 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -3228,7 +3241,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
btrfs_root_refs(&root->root_item) > 0) {
- ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
root->root_key.objectid);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -3239,7 +3252,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
if (block_rsv) {
WARN_ON(block_rsv->size > 0);
- btrfs_free_block_rsv(root, block_rsv);
+ btrfs_free_block_rsv(fs_info, block_rsv);
}
}
@@ -3252,6 +3265,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *block_rsv = NULL;
int reserve = 0;
@@ -3259,7 +3273,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int ret;
if (!root->orphan_block_rsv) {
- block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ block_rsv = btrfs_alloc_block_rsv(fs_info,
+ BTRFS_BLOCK_RSV_TEMP);
if (!block_rsv)
return -ENOMEM;
}
@@ -3268,7 +3283,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (!root->orphan_block_rsv) {
root->orphan_block_rsv = block_rsv;
} else if (block_rsv) {
- btrfs_free_block_rsv(root, block_rsv);
+ btrfs_free_block_rsv(fs_info, block_rsv);
block_rsv = NULL;
}
@@ -3331,7 +3346,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* insert an orphan item to track subvolume contains orphan files */
if (insert >= 2) {
- ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
root->root_key.objectid);
if (ret && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
@@ -3382,6 +3397,7 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
*/
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
@@ -3441,8 +3457,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*/
if (found_key.offset == last_objectid) {
- btrfs_err(root->fs_info,
- "Error removing orphan entry, stopping orphan cleanup");
+ btrfs_err(fs_info,
+ "Error removing orphan entry, stopping orphan cleanup");
ret = -EINVAL;
goto out;
}
@@ -3452,12 +3468,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
- if (ret == -ENOENT && root == root->fs_info->tree_root) {
+ if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
@@ -3499,11 +3515,11 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_debug(root->fs_info, "auto deleting %Lu",
- found_key.objectid);
+ btrfs_debug(fs_info, "auto deleting %Lu",
+ found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret)
goto out;
continue;
@@ -3533,7 +3549,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
}
ret = btrfs_orphan_add(trans, inode);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret) {
iput(inode);
goto out;
@@ -3557,25 +3573,24 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
if (root->orphan_block_rsv)
- btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
(u64)-1);
if (root->orphan_block_rsv ||
test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
}
if (nr_unlink)
- btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
+ btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
if (nr_truncate)
- btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
+ btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
out:
if (ret)
- btrfs_err(root->fs_info,
- "could not do orphan cleanup %d", ret);
+ btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
}
@@ -3654,6 +3669,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
*/
static int btrfs_read_locked_inode(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
@@ -3734,7 +3750,7 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in delayed_nodes_tree.
*/
- if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+ if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -3800,7 +3816,7 @@ cache_acl:
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(inode),
root->root_key.objectid, ret);
@@ -3819,10 +3835,7 @@ cache_acl:
break;
case S_IFDIR:
inode->i_fop = &btrfs_dir_file_operations;
- if (root == root->fs_info->tree_root)
- inode->i_op = &btrfs_dir_ro_inode_operations;
- else
- inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
@@ -3937,6 +3950,7 @@ failed:
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
/*
@@ -3948,7 +3962,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
*/
if (!btrfs_is_free_space_inode(inode)
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+ && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -3982,6 +3996,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
const char *name, int name_len)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
struct extent_buffer *leaf;
@@ -4036,14 +4051,14 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
dir_ino, &index);
if (ret) {
- btrfs_info(root->fs_info,
+ btrfs_info(fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
name_len, name, ino, dir_ino);
btrfs_abort_transaction(trans, ret);
goto err;
}
skip_backref:
- ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
@@ -4138,8 +4153,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
}
out:
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(root->fs_info);
return ret;
}
@@ -4148,6 +4163,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct inode *dir, u64 objectid,
const char *name, int name_len)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
@@ -4180,9 +4196,9 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
- objectid, root->root_key.objectid,
- dir_ino, &index, name, name_len);
+ ret = btrfs_del_root_ref(trans, fs_info, objectid,
+ root->root_key.objectid, dir_ino,
+ &index, name, name_len);
if (ret < 0) {
if (ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
@@ -4206,7 +4222,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -4274,8 +4290,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
}
out:
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(root->fs_info);
return err;
}
@@ -4284,18 +4300,19 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytes_deleted)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
/*
* This is only used to apply pressure to the enospc system, we don't
* intend to use this reservation at all.
*/
- bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
- bytes_deleted *= root->nodesize;
- ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+ bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
+ bytes_deleted *= fs_info->nodesize;
+ ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
- trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
bytes_deleted, 1);
trans->bytes_reserved += bytes_deleted;
@@ -4338,7 +4355,7 @@ static int truncate_inline_extent(struct inode *inode,
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root, path, size, 1);
+ btrfs_truncate_item(root->fs_info, path, size, 1);
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
@@ -4362,6 +4379,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 new_size, u32 min_type)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
@@ -4407,9 +4425,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* extent just the way it is.
*/
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == root->fs_info->tree_root)
+ root == fs_info->tree_root)
btrfs_drop_extent_cache(inode, ALIGN(new_size,
- root->sectorsize), (u64)-1, 0);
+ fs_info->sectorsize),
+ (u64)-1, 0);
/*
* This function is also used to drop the items in the log tree before
@@ -4431,7 +4450,7 @@ search_again:
* bytes_deleted is > 0, it will be huge by the time we get here
*/
if (be_nice && bytes_deleted > SZ_32M) {
- if (btrfs_should_end_transaction(trans, root)) {
+ if (btrfs_should_end_transaction(trans)) {
err = -EAGAIN;
goto error;
}
@@ -4483,8 +4502,19 @@ search_again:
if (found_type > min_type) {
del_item = 1;
} else {
- if (item_end < new_size)
+ if (item_end < new_size) {
+ /*
+ * With NO_HOLES mode, for the following mapping
+ *
+ * [0-4k][hole][8k-12k]
+ *
+ * if truncating isize down to 6k, it ends up
+ * isize being 8k.
+ */
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ last_size = new_size;
break;
+ }
if (found_key.offset >= new_size)
del_item = 1;
else
@@ -4508,7 +4538,7 @@ search_again:
btrfs_file_extent_num_bytes(leaf, fi);
extent_num_bytes = ALIGN(new_size -
found_key.offset,
- root->sectorsize);
+ fs_info->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
@@ -4595,16 +4625,16 @@ delete:
if (found_extent &&
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == root->fs_info->tree_root)) {
+ root == fs_info->tree_root)) {
btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
- ret = btrfs_free_extent(trans, root, extent_start,
+ ret = btrfs_free_extent(trans, fs_info, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset);
BUG_ON(ret);
- if (btrfs_should_throttle_delayed_refs(trans, root))
- btrfs_async_run_delayed_refs(root,
+ if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+ btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
if (be_nice) {
@@ -4613,9 +4643,8 @@ delete:
should_end = 1;
}
if (btrfs_should_throttle_delayed_refs(trans,
- root)) {
+ fs_info))
should_throttle = 1;
- }
}
}
@@ -4640,7 +4669,9 @@ delete:
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ ret = btrfs_run_delayed_refs(trans,
+ fs_info,
+ updates * 2);
if (ret && !err)
err = ret;
}
@@ -4675,7 +4706,8 @@ error:
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ ret = btrfs_run_delayed_refs(trans, fs_info,
+ updates * 2);
if (ret && !err)
err = ret;
}
@@ -4697,13 +4729,13 @@ error:
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
char *kaddr;
- u32 blocksize = root->sectorsize;
+ u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
@@ -4807,6 +4839,7 @@ out:
static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
u64 offset, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
int ret;
@@ -4814,8 +4847,8 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* Still need to make sure the inode looks like it's been updated so
* that any holes get logged if we fsync.
*/
- if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
- BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->last_trans = fs_info->generation;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
BTRFS_I(inode)->last_log_commit = root->last_log_commit;
return 0;
@@ -4833,7 +4866,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
if (ret) {
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -4843,7 +4876,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
btrfs_abort_transaction(trans, ret);
else
btrfs_update_inode(trans, root, inode);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -4855,13 +4888,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
*/
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 hole_start = ALIGN(oldsize, root->sectorsize);
- u64 block_end = ALIGN(size, root->sectorsize);
+ u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
+ u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
@@ -4904,7 +4938,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
break;
}
last_byte = min(extent_map_end(em), block_end);
- last_byte = ALIGN(last_byte , root->sectorsize);
+ last_byte = ALIGN(last_byte, fs_info->sectorsize);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
@@ -4929,9 +4963,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = root->fs_info->generation;
+ hole_em->generation = fs_info->generation;
while (1) {
write_lock(&em_tree->lock);
@@ -5006,7 +5040,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_write_no_snapshoting(root);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
} else {
/*
@@ -5037,7 +5071,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* will be consistent.
*/
ret = btrfs_orphan_add(trans, inode);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret)
return ret;
@@ -5068,7 +5102,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
err = btrfs_orphan_del(trans, inode);
if (err)
btrfs_abort_transaction(trans, err);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
}
}
@@ -5201,6 +5235,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
void btrfs_evict_inode(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
@@ -5215,7 +5250,7 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
- min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
evict_inode_truncate_pages(inode);
@@ -5235,7 +5270,7 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_free_io_failure_record(inode, 0, (u64)-1);
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags));
goto no_delete;
@@ -5253,14 +5288,14 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
- rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
btrfs_orphan_del(NULL, inode);
goto no_delete;
}
rsv->size = min_size;
rsv->failfast = 1;
- global_rsv = &root->fs_info->global_block_rsv;
+ global_rsv = &fs_info->global_block_rsv;
btrfs_i_size_write(inode, 0);
@@ -5294,18 +5329,18 @@ void btrfs_evict_inode(struct inode *inode)
* steal_from_global == 3: abandon all hope!
*/
if (steal_from_global > 2) {
- btrfs_warn(root->fs_info,
- "Could not get space for a delete, will truncate on mount %d",
- ret);
+ btrfs_warn(fs_info,
+ "Could not get space for a delete, will truncate on mount %d",
+ ret);
btrfs_orphan_del(NULL, inode);
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
goto no_delete;
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_orphan_del(NULL, inode);
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
goto no_delete;
}
@@ -5315,7 +5350,7 @@ void btrfs_evict_inode(struct inode *inode)
* again.
*/
if (steal_from_global) {
- if (!btrfs_check_space_for_delayed_refs(trans, root))
+ if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
ret = btrfs_block_rsv_migrate(global_rsv, rsv,
min_size, 0);
else
@@ -5328,10 +5363,10 @@ void btrfs_evict_inode(struct inode *inode)
* again.
*/
if (ret) {
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret) {
btrfs_orphan_del(NULL, inode);
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
goto no_delete;
}
continue;
@@ -5345,13 +5380,13 @@ void btrfs_evict_inode(struct inode *inode)
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- btrfs_end_transaction(trans, root);
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ btrfs_end_transaction(trans);
trans = NULL;
- btrfs_btree_balance_dirty(root);
+ btrfs_btree_balance_dirty(fs_info);
}
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
/*
* Errors here aren't a big deal, it just means we leave orphan items
@@ -5364,13 +5399,13 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_orphan_del(NULL, inode);
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- if (!(root == root->fs_info->tree_root ||
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ if (!(root == fs_info->tree_root ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
btrfs_return_ino(root, btrfs_ino(inode));
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
@@ -5416,7 +5451,7 @@ out_err:
* needs to be changed to reflect the root directory of the tree root. This
* is kind of like crossing a mount point.
*/
-static int fixup_tree_root_location(struct btrfs_root *root,
+static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct inode *dir,
struct dentry *dentry,
struct btrfs_key *location,
@@ -5441,8 +5476,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
- ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
- 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret) {
if (ret < 0)
err = ret;
@@ -5463,7 +5497,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
btrfs_release_path(path);
- new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, location);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@@ -5517,6 +5551,7 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int empty = 0;
@@ -5529,7 +5564,7 @@ static void inode_tree_del(struct inode *inode)
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- synchronize_srcu(&root->fs_info->subvol_srcu);
+ synchronize_srcu(&fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
@@ -5540,13 +5575,14 @@ static void inode_tree_del(struct inode *inode)
void btrfs_invalidate_inodes(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
struct rb_node *prev;
struct btrfs_inode *entry;
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -5682,6 +5718,7 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
inode->i_op = &btrfs_dir_ro_inode_operations;
+ inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
inode->i_mtime = current_time(inode);
@@ -5694,6 +5731,7 @@ static struct inode *new_simple_dir(struct super_block *s,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
@@ -5718,8 +5756,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
- index = srcu_read_lock(&root->fs_info->subvol_srcu);
- ret = fixup_tree_root_location(root, dir, dentry,
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
@@ -5729,13 +5767,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
} else {
inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
}
- srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
if (!IS_ERR(inode) && root != sub_root) {
- down_read(&root->fs_info->cleanup_work_sem);
+ down_read(&fs_info->cleanup_work_sem);
if (!(inode->i_sb->s_flags & MS_RDONLY))
ret = btrfs_orphan_cleanup(sub_root);
- up_read(&root->fs_info->cleanup_work_sem);
+ up_read(&fs_info->cleanup_work_sem);
if (ret) {
iput(inode);
inode = ERR_PTR(ret);
@@ -5792,6 +5830,7 @@ unsigned char btrfs_filetype_table[] = {
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_item *item;
struct btrfs_dir_item *di;
@@ -5805,20 +5844,11 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
int slot;
unsigned char d_type;
int over = 0;
- u32 di_cur;
- u32 di_total;
- u32 di_len;
- int key_type = BTRFS_DIR_INDEX_KEY;
char tmp_name[32];
char *name_ptr;
int name_len;
- int is_curr = 0; /* ctx->pos points to the current index? */
- bool emitted;
bool put = false;
-
- /* FIXME, use a real flag for deciding about the key type */
- if (root->fs_info->tree_root == root)
- key_type = BTRFS_DIR_ITEM_KEY;
+ struct btrfs_key location;
if (!dir_emit_dots(file, ctx))
return 0;
@@ -5829,14 +5859,11 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
path->reada = READA_FORWARD;
- if (key_type == BTRFS_DIR_INDEX_KEY) {
- INIT_LIST_HEAD(&ins_list);
- INIT_LIST_HEAD(&del_list);
- put = btrfs_readdir_get_delayed_items(inode, &ins_list,
- &del_list);
- }
+ INIT_LIST_HEAD(&ins_list);
+ INIT_LIST_HEAD(&del_list);
+ put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
- key.type = key_type;
+ key.type = BTRFS_DIR_INDEX_KEY;
key.offset = ctx->pos;
key.objectid = btrfs_ino(inode);
@@ -5844,7 +5871,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (ret < 0)
goto err;
- emitted = false;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
@@ -5862,98 +5888,52 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (found_key.objectid != key.objectid)
break;
- if (found_key.type != key_type)
+ if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
goto next;
- if (key_type == BTRFS_DIR_INDEX_KEY &&
- btrfs_should_delete_dir_index(&del_list,
- found_key.offset))
+ if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
goto next;
ctx->pos = found_key.offset;
- is_curr = 1;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- di_cur = 0;
- di_total = btrfs_item_size(leaf, item);
-
- while (di_cur < di_total) {
- struct btrfs_key location;
-
- if (verify_dir_item(root, leaf, di))
- break;
+ if (verify_dir_item(fs_info, leaf, di))
+ goto next;
- name_len = btrfs_dir_name_len(leaf, di);
- if (name_len <= sizeof(tmp_name)) {
- name_ptr = tmp_name;
- } else {
- name_ptr = kmalloc(name_len, GFP_KERNEL);
- if (!name_ptr) {
- ret = -ENOMEM;
- goto err;
- }
+ name_len = btrfs_dir_name_len(leaf, di);
+ if (name_len <= sizeof(tmp_name)) {
+ name_ptr = tmp_name;
+ } else {
+ name_ptr = kmalloc(name_len, GFP_KERNEL);
+ if (!name_ptr) {
+ ret = -ENOMEM;
+ goto err;
}
- read_extent_buffer(leaf, name_ptr,
- (unsigned long)(di + 1), name_len);
-
- d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
- btrfs_dir_item_key_to_cpu(leaf, di, &location);
+ }
+ read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
+ name_len);
+ d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+ btrfs_dir_item_key_to_cpu(leaf, di, &location);
- /* is this a reference to our own snapshot? If so
- * skip it.
- *
- * In contrast to old kernels, we insert the snapshot's
- * dir item and dir index after it has been created, so
- * we won't find a reference to our own snapshot. We
- * still keep the following code for backward
- * compatibility.
- */
- if (location.type == BTRFS_ROOT_ITEM_KEY &&
- location.objectid == root->root_key.objectid) {
- over = 0;
- goto skip;
- }
- over = !dir_emit(ctx, name_ptr, name_len,
- location.objectid, d_type);
+ over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
+ d_type);
-skip:
- if (name_ptr != tmp_name)
- kfree(name_ptr);
+ if (name_ptr != tmp_name)
+ kfree(name_ptr);
- if (over)
- goto nopos;
- emitted = true;
- di_len = btrfs_dir_name_len(leaf, di) +
- btrfs_dir_data_len(leaf, di) + sizeof(*di);
- di_cur += di_len;
- di = (struct btrfs_dir_item *)((char *)di + di_len);
- }
+ if (over)
+ goto nopos;
+ ctx->pos++;
next:
path->slots[0]++;
}
- if (key_type == BTRFS_DIR_INDEX_KEY) {
- if (is_curr)
- ctx->pos++;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
- if (ret)
- goto nopos;
- }
-
- /*
- * If we haven't emitted any dir entry, we must not touch ctx->pos as
- * it was was set to the termination value in previous call. We assume
- * that "." and ".." were emitted if we reach this point and set the
- * termination value as well for an empty directory.
- */
- if (ctx->pos > 2 && !emitted)
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ if (ret)
goto nopos;
- /* Reached end of directory/root. Bump pos past the last item. */
- ctx->pos++;
-
/*
* Stop new entries from being returned after we return the last
* entry.
@@ -5971,12 +5951,10 @@ next:
* last entry requires it because doing so has broken 32bit apps
* in the past.
*/
- if (key_type == BTRFS_DIR_INDEX_KEY) {
- if (ctx->pos >= INT_MAX)
- ctx->pos = LLONG_MAX;
- else
- ctx->pos = INT_MAX;
- }
+ if (ctx->pos >= INT_MAX)
+ ctx->pos = LLONG_MAX;
+ else
+ ctx->pos = INT_MAX;
nopos:
ret = 0;
err:
@@ -6006,7 +5984,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
}
return ret;
}
@@ -6019,6 +5997,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
static int btrfs_dirty_inode(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
@@ -6033,16 +6012,16 @@ static int btrfs_dirty_inode(struct inode *inode)
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret == -ENOSPC) {
/* whoops, lets try again with the full transaction */
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
}
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (BTRFS_I(inode)->delayed_node)
- btrfs_balance_delayed_items(root);
+ btrfs_balance_delayed_items(fs_info);
return ret;
}
@@ -6168,6 +6147,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
u64 ref_objectid, u64 objectid,
umode_t mode, u64 *index)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
@@ -6183,7 +6163,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (!path)
return ERR_PTR(-ENOMEM);
- inode = new_inode(root->fs_info->sb);
+ inode = new_inode(fs_info->sb);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
@@ -6277,7 +6257,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+ memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
@@ -6296,9 +6276,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) {
- if (btrfs_test_opt(root->fs_info, NODATASUM))
+ if (btrfs_test_opt(fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root->fs_info, NODATACOW))
+ if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
}
@@ -6312,7 +6292,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
ret = btrfs_inode_inherit_props(trans, inode, dir);
if (ret)
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
btrfs_ino(inode), root->root_key.objectid, ret);
@@ -6343,6 +6323,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
@@ -6358,9 +6339,9 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
}
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- key.objectid, root->root_key.objectid,
- parent_ino, index, name, name_len);
+ ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
+ root->root_key.objectid, parent_ino,
+ index, name, name_len);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
parent_ino, index);
@@ -6394,9 +6375,9 @@ fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
int err;
- err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
- key.objectid, root->root_key.objectid,
- parent_ino, &local_index, name, name_len);
+ err = btrfs_del_root_ref(trans, fs_info, key.objectid,
+ root->root_key.objectid, parent_ino,
+ &local_index, name, name_len);
} else if (add_backref) {
u64 local_index;
@@ -6423,6 +6404,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
@@ -6475,9 +6457,9 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
- btrfs_end_transaction(trans, root);
- btrfs_balance_delayed_items(root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_balance_delayed_items(fs_info);
+ btrfs_btree_balance_dirty(fs_info);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -6494,6 +6476,7 @@ out_unlock_inode:
static int btrfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
@@ -6550,13 +6533,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
d_instantiate(dentry, inode);
out_unlock:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (err && drop_inode_on_err) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(root);
- btrfs_btree_balance_dirty(root);
+ btrfs_balance_delayed_items(fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
out_unlock_inode:
@@ -6571,6 +6554,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 index;
int err;
int drop_inode = 0;
@@ -6628,20 +6612,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_balance_delayed_items(root);
+ btrfs_balance_delayed_items(fs_info);
fail:
if (trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -6699,13 +6684,13 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
drop_on_err = 0;
out_fail:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (drop_on_err) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_balance_delayed_items(root);
- btrfs_btree_balance_dirty(root);
+ btrfs_balance_delayed_items(fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
out_fail_inode:
@@ -6820,6 +6805,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
size_t pg_offset, u64 start, u64 len,
int create)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
int err = 0;
u64 extent_start = 0;
@@ -6841,7 +6827,7 @@ again:
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
if (em) {
@@ -6857,7 +6843,7 @@ again:
err = -ENOMEM;
goto out;
}
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
@@ -6916,7 +6902,8 @@ again:
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
- extent_end = ALIGN(extent_start + size, root->sectorsize);
+ extent_end = ALIGN(extent_start + size,
+ fs_info->sectorsize);
}
next:
if (start >= extent_end) {
@@ -6965,7 +6952,7 @@ next:
copy_size = min_t(u64, PAGE_SIZE - pg_offset,
size - extent_offset);
em->start = extent_start + extent_offset;
- em->len = ALIGN(copy_size, root->sectorsize);
+ em->len = ALIGN(copy_size, fs_info->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
@@ -7024,7 +7011,7 @@ not_found_em:
insert:
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
err = -EIO;
@@ -7049,11 +7036,11 @@ insert:
* extent causing the -EEXIST.
*/
if (existing->start == em->start &&
- extent_map_end(existing) == extent_map_end(em) &&
+ extent_map_end(existing) >= extent_map_end(em) &&
em->block_start == existing->block_start) {
/*
- * these two extents are the same, it happens
- * with inlines especially
+ * The existing extent map already encompasses the
+ * entire extent map we tried to add.
*/
free_extent_map(em);
em = existing;
@@ -7081,11 +7068,11 @@ insert:
write_unlock(&em_tree->lock);
out:
- trace_btrfs_get_extent(root, em);
+ trace_btrfs_get_extent(root, inode, em);
btrfs_free_path(path);
if (trans) {
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
if (!err)
err = ret;
}
@@ -7237,7 +7224,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
struct extent_map *em = NULL;
int ret;
- down_read(&BTRFS_I(inode)->dio_sem);
if (type != BTRFS_ORDERED_NOCOW) {
em = create_pinned_em(inode, start, len, orig_start,
block_start, block_len, orig_block_len,
@@ -7256,7 +7242,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
em = ERR_PTR(ret);
}
out:
- up_read(&BTRFS_I(inode)->dio_sem);
return em;
}
@@ -7264,6 +7249,7 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
u64 start, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em;
struct btrfs_key ins;
@@ -7271,17 +7257,18 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
- ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
- alloc_hint, &ins, 1, 1);
+ ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+ 0, alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = btrfs_create_dio_extent(inode, start, ins.offset, start,
ins.objectid, ins.offset, ins.offset,
ins.offset, 0);
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid,
+ ins.offset, 1);
return em;
}
@@ -7294,6 +7281,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
int ret;
@@ -7374,14 +7362,15 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- if (btrfs_extent_readonly(root, disk_bytenr))
+ if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out;
num_bytes = min(offset + *len, extent_end) - offset;
if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ range_end = round_up(offset + num_bytes,
+ root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
if (ret) {
@@ -7404,7 +7393,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
key.offset - backref_offset, disk_bytenr);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret) {
ret = 0;
goto out;
@@ -7418,8 +7407,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- if (csum_exist_in_range(root, disk_bytenr, num_bytes))
- goto out;
+ if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
+ goto out;
/*
* all of the above have passed, it is safe to overwrite this extent
* without cow
@@ -7641,11 +7630,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode,
* within our reservation, otherwise we need to adjust our inode
* counter appropriately.
*/
- if (dio_data->outstanding_extents) {
+ if (dio_data->outstanding_extents >= num_extents) {
dio_data->outstanding_extents -= num_extents;
} else {
+ /*
+ * If dio write length has been split due to no large enough
+ * contiguous space, we need to compensate our inode counter
+ * appropriately.
+ */
+ u64 num_needed = num_extents - dio_data->outstanding_extents;
+
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents += num_extents;
+ BTRFS_I(inode)->outstanding_extents += num_needed;
spin_unlock(&BTRFS_I(inode)->lock);
}
}
@@ -7653,8 +7649,8 @@ static void adjust_dio_outstanding_extents(struct inode *inode,
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
u64 start = iblock << inode->i_blkbits;
@@ -7666,7 +7662,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (create)
unlock_bits |= EXTENT_DIRTY;
else
- len = min_t(u64, len, root->sectorsize);
+ len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
@@ -7755,14 +7751,14 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (can_nocow_extent(inode, start, &len, &orig_start,
&orig_block_len, &ram_bytes) == 1 &&
- btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+ btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
em2 = btrfs_create_dio_extent(inode, start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
- btrfs_dec_nocow_writers(root->fs_info, block_start);
+ btrfs_dec_nocow_writers(fs_info, block_start);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
em = em2;
@@ -7855,19 +7851,18 @@ err:
static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
int mirror_num)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
bio_get(bio);
- ret = btrfs_bio_wq_end_io(root->fs_info, bio,
- BTRFS_WQ_ENDIO_DIO_REPAIR);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
if (ret)
goto err;
- ret = btrfs_map_bio(root, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
err:
bio_put(bio);
return ret;
@@ -7917,7 +7912,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec;
struct bio *bio;
int isector;
- int read_mode;
+ int read_mode = 0;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7935,10 +7930,8 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
if ((failed_bio->bi_vcnt > 1)
|| (failed_bio->bi_io_vec->bv_len
- > BTRFS_I(inode)->root->sectorsize))
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
- else
- read_mode = READ_SYNC;
+ > btrfs_inode_sectorsize(inode)))
+ read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
isector >>= inode->i_sb->s_blocksize_bits;
@@ -7982,7 +7975,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
inode = bio->bi_io_vec->bv_page->mapping->host;
- ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
done->uptodate = 1;
bio_for_each_segment_all(bvec, bio, i)
@@ -8006,7 +7999,7 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
int ret;
fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = BTRFS_I(inode)->root->sectorsize;
+ sectorsize = fs_info->sectorsize;
start = io_bio->logical;
done.inode = inode;
@@ -8065,7 +8058,7 @@ static void btrfs_retry_endio(struct bio *bio)
ASSERT(bio->bi_vcnt == 1);
inode = bio->bi_io_vec->bv_page->mapping->host;
- ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
bio_for_each_segment_all(bvec, bio, i) {
ret = __readpage_endio_check(done->inode, io_bio, i,
@@ -8100,7 +8093,7 @@ static int __btrfs_subio_endio_read(struct inode *inode,
int ret;
fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = BTRFS_I(inode)->root->sectorsize;
+ sectorsize = fs_info->sectorsize;
err = 0;
start = io_bio->logical;
@@ -8197,7 +8190,7 @@ static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
const u64 bytes,
const int uptodate)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
@@ -8213,8 +8206,7 @@ again:
btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(root->fs_info->endio_write_workers,
- &ordered->work);
+ btrfs_queue_work(fs_info->endio_write_workers, &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -8249,8 +8241,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
unsigned long bio_flags, u64 offset)
{
int ret;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+ ret = btrfs_csum_one_bio(inode, bio, offset, 1);
BUG_ON(ret); /* -ENOMEM */
return 0;
}
@@ -8304,8 +8295,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
return bio;
}
-static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
- struct inode *inode,
+static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
struct btrfs_dio_private *dip,
struct bio *bio,
u64 file_offset)
@@ -8320,7 +8310,7 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
* contention.
*/
if (dip->logical_offset == file_offset) {
- ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+ ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
file_offset);
if (ret)
return ret;
@@ -8340,9 +8330,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
u64 file_offset, int skip_sum,
int async_submit)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
bool write = bio_op(bio) == REQ_OP_WRITE;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
if (async_submit)
@@ -8351,8 +8341,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
bio_get(bio);
if (!write) {
- ret = btrfs_bio_wq_end_io(root->fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
goto err;
}
@@ -8361,27 +8350,27 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(root->fs_info,
- inode, bio, 0, 0, file_offset,
- __btrfs_submit_bio_start_direct_io,
- __btrfs_submit_bio_done);
+ ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0,
+ file_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
goto err;
} else if (write) {
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
if (ret)
goto err;
} else {
- ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+ ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
file_offset);
if (ret)
goto err;
}
map:
- ret = btrfs_map_bio(root, bio, 0, async_submit);
+ ret = btrfs_map_bio(fs_info, bio, 0, async_submit);
err:
bio_put(bio);
return ret;
@@ -8391,23 +8380,24 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
int skip_sum)
{
struct inode *inode = dip->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
- struct bio_vec *bvec = orig_bio->bi_io_vec;
+ struct bio_vec *bvec;
u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
- u32 blocksize = root->sectorsize;
+ u32 blocksize = fs_info->sectorsize;
int async_submit = 0;
int nr_sectors;
int ret;
- int i;
+ int i, j;
map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
- start_sector << 9, &map_length, NULL, 0);
+ ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
+ &map_length, NULL, 0);
if (ret)
return -EIO;
@@ -8427,14 +8417,14 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
if (!bio)
return -ENOMEM;
- bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
+ bio->bi_opf = orig_bio->bi_opf;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
atomic_inc(&dip->pending_bios);
- while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
+ bio_for_each_segment_all(bvec, orig_bio, j) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
i = 0;
next_block:
if (unlikely(map_length < submit_len + blocksize ||
@@ -8465,14 +8455,13 @@ next_block:
start_sector, GFP_NOFS);
if (!bio)
goto out_err;
- bio_set_op_attrs(bio, bio_op(orig_bio),
- bio_flags(orig_bio));
+ bio->bi_opf = orig_bio->bi_opf;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
map_length = orig_bio->bi_iter.bi_size;
- ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
+ ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
start_sector << 9,
&map_length, NULL, 0);
if (ret) {
@@ -8487,7 +8476,6 @@ next_block:
i++;
goto next_block;
}
- bvec++;
}
}
@@ -8619,12 +8607,13 @@ free_ordered:
kfree(dip);
}
-static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
- const struct iov_iter *iter, loff_t offset)
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ struct kiocb *iocb,
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
int i;
- unsigned blocksize_mask = root->sectorsize - 1;
+ unsigned int blocksize_mask = fs_info->sectorsize - 1;
ssize_t retval = -EINVAL;
if (offset & blocksize_mask)
@@ -8656,7 +8645,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_data dio_data = { 0 };
loff_t offset = iocb->ki_pos;
size_t count = 0;
@@ -8665,7 +8654,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
+ if (check_direct_IO(fs_info, iocb, iter, offset))
return 0;
inode_dio_begin(inode);
@@ -8705,10 +8694,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* do the accounting properly if we go over the number we
* originally calculated. Abuse current->journal_info for this.
*/
- dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.reserve = round_up(count,
+ fs_info->sectorsize);
dio_data.unsubmitted_oe_range_start = (u64)offset;
dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
+ down_read(&BTRFS_I(inode)->dio_sem);
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
inode_dio_end(inode);
@@ -8717,10 +8708,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
ret = __blockdev_direct_IO(iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ fs_info->fs_devices->latest_bdev,
iter, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (iov_iter_rw(iter) == WRITE) {
+ up_read(&BTRFS_I(inode)->dio_sem);
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
@@ -8976,7 +8968,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
@@ -9051,7 +9043,8 @@ again:
}
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
- reserved_space = round_up(size - page_start, root->sectorsize);
+ reserved_space = round_up(size - page_start,
+ fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
spin_lock(&BTRFS_I(inode)->lock);
@@ -9100,7 +9093,7 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_trans = fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
@@ -9121,13 +9114,14 @@ out_noreserve:
static int btrfs_truncate(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
int ret = 0;
int err = 0;
struct btrfs_trans_handle *trans;
- u64 mask = root->sectorsize - 1;
- u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 mask = fs_info->sectorsize - 1;
+ u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
(u64)-1);
@@ -9170,7 +9164,7 @@ static int btrfs_truncate(struct inode *inode)
* 3) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
@@ -9187,7 +9181,7 @@ static int btrfs_truncate(struct inode *inode)
}
/* Migrate the slack space for the truncate to our reserve */
- ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, 0);
BUG_ON(ret);
@@ -9210,15 +9204,15 @@ static int btrfs_truncate(struct inode *inode)
break;
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
err = ret;
break;
}
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
@@ -9227,7 +9221,8 @@ static int btrfs_truncate(struct inode *inode)
break;
}
- ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ btrfs_block_rsv_release(fs_info, rsv, -1);
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
@@ -9241,16 +9236,16 @@ static int btrfs_truncate(struct inode *inode)
}
if (trans) {
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
if (ret && !err)
err = ret;
- ret = btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ ret = btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
}
out:
- btrfs_free_block_rsv(root, rsv);
+ btrfs_free_block_rsv(fs_info, rsv);
if (ret && !err)
err = ret;
@@ -9366,6 +9361,7 @@ static void btrfs_i_callback(struct rcu_head *head)
void btrfs_destroy_inode(struct inode *inode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -9387,8 +9383,8 @@ void btrfs_destroy_inode(struct inode *inode)
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) {
- btrfs_info(root->fs_info, "inode %llu still on the orphan list",
- btrfs_ino(inode));
+ btrfs_info(fs_info, "inode %llu still on the orphan list",
+ btrfs_ino(inode));
atomic_dec(&root->orphan_inodes);
}
@@ -9397,7 +9393,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->len);
btrfs_remove_ordered_extent(inode, ordered);
@@ -9509,6 +9505,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_dir,
struct dentry *new_dentry)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
@@ -9531,9 +9528,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&root->fs_info->subvol_sem);
+ down_read(&fs_info->subvol_sem);
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&dest->fs_info->subvol_sem);
+ down_read(&fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
@@ -9566,7 +9563,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* Reference for the source. */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
} else {
btrfs_pin_log_trans(root);
root_log_pinned = true;
@@ -9582,7 +9579,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* And now for the dest. */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(dest->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
} else {
btrfs_pin_log_trans(dest);
dest_log_pinned = true;
@@ -9696,12 +9693,12 @@ out_fail:
* allow the tasks to sync it.
*/
if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
- btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
- btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ if (btrfs_inode_in_log(old_dir, fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, fs_info->generation) ||
(new_inode &&
- btrfs_inode_in_log(new_inode, root->fs_info->generation)))
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_inode_in_log(new_inode, fs_info->generation)))
+ btrfs_set_log_full_commit(fs_info, trans);
if (root_log_pinned) {
btrfs_end_log_trans(root);
@@ -9712,12 +9709,12 @@ out_fail:
dest_log_pinned = false;
}
}
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&dest->fs_info->subvol_sem);
+ up_read(&fs_info->subvol_sem);
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&root->fs_info->subvol_sem);
+ up_read(&fs_info->subvol_sem);
return ret;
}
@@ -9777,6 +9774,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -9833,7 +9831,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* close the racy window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&root->fs_info->subvol_sem);
+ down_read(&fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
@@ -9864,7 +9862,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
} else {
btrfs_pin_log_trans(root);
log_pinned = true;
@@ -9971,20 +9969,20 @@ out_fail:
* allow the tasks to sync it.
*/
if (ret && log_pinned) {
- if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
- btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
- btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+ if (btrfs_inode_in_log(old_dir, fs_info->generation) ||
+ btrfs_inode_in_log(new_dir, fs_info->generation) ||
+ btrfs_inode_in_log(old_inode, fs_info->generation) ||
(new_inode &&
- btrfs_inode_in_log(new_inode, root->fs_info->generation)))
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_inode_in_log(new_inode, fs_info->generation)))
+ btrfs_set_log_full_commit(fs_info, trans);
btrfs_end_log_trans(root);
log_pinned = false;
}
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&root->fs_info->subvol_sem);
+ up_read(&fs_info->subvol_sem);
return ret;
}
@@ -10119,9 +10117,10 @@ out:
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
ret = __start_delalloc_inodes(root, delay_iput, -1);
@@ -10132,14 +10131,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
- atomic_inc(&root->fs_info->async_submit_draining);
- while (atomic_read(&root->fs_info->nr_async_submits) ||
- atomic_read(&root->fs_info->async_delalloc_pages)) {
- wait_event(root->fs_info->async_submit_wait,
- (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
- atomic_read(&root->fs_info->async_delalloc_pages) == 0));
- }
- atomic_dec(&root->fs_info->async_submit_draining);
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
+ }
+ atomic_dec(&fs_info->async_submit_draining);
return ret;
}
@@ -10202,6 +10201,7 @@ out:
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
@@ -10218,7 +10218,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
/*
@@ -10312,12 +10312,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
d_instantiate(dentry, inode);
out_unlock:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root);
+ btrfs_btree_balance_dirty(fs_info);
return err;
out_unlock_inode:
@@ -10331,6 +10331,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -10367,10 +10368,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
min_size, 0, *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
break;
}
- btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
@@ -10379,11 +10380,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid,
+ btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
btrfs_abort_transaction(trans, ret);
if (own_trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
break;
}
@@ -10404,7 +10405,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
@@ -10443,12 +10444,12 @@ next:
if (ret) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
break;
}
if (own_trans)
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
}
if (cur_offset < end)
btrfs_free_reserved_data_space(inode, cur_offset,
@@ -10496,6 +10497,7 @@ static int btrfs_permission(struct inode *inode, int mask)
static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
@@ -10552,11 +10554,11 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
mark_inode_dirty(inode);
out:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (ret)
iput(inode);
- btrfs_balance_delayed_items(root);
- btrfs_btree_balance_dirty(root);
+ btrfs_balance_delayed_items(fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return ret;
out_inode:
@@ -10587,8 +10589,6 @@ static const struct inode_operations btrfs_dir_inode_operations = {
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
- .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
@@ -10668,7 +10668,6 @@ static const struct inode_operations btrfs_special_inode_operations = {
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7acbd2cf6192..21e51b0ba188 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -33,7 +33,6 @@
#include <linux/namei.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/security.h>
@@ -216,6 +215,7 @@ static int check_flags(unsigned int flags)
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *ip = BTRFS_I(inode);
struct btrfs_root *root = ip->root;
struct btrfs_trans_handle *trans;
@@ -325,7 +325,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_COMPRESS;
ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
- if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
comp = "lzo";
else
comp = "zlib";
@@ -352,7 +352,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
out_drop:
if (ret) {
ip->flags = ip_oldflags;
@@ -374,7 +374,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
@@ -410,7 +411,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
- ret = btrfs_trim_fs(fs_info->tree_root, &range);
+ ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
return ret;
@@ -437,6 +438,7 @@ static noinline int create_subvol(struct inode *dir,
u64 *async_transid,
struct btrfs_qgroup_inherit *inherit)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item *root_item;
@@ -459,7 +461,7 @@ static noinline int create_subvol(struct inode *dir,
if (!root_item)
return -ENOMEM;
- ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
+ ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
@@ -485,14 +487,14 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(root, &block_rsv,
+ btrfs_subvolume_release_metadata(fs_info, &block_rsv,
qgroup_reserved);
goto fail_free;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit);
if (ret)
goto fail;
@@ -502,24 +504,22 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+ memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(leaf, leaf->start);
btrfs_set_header_generation(leaf, trans->transid);
btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(leaf, objectid);
- write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
- BTRFS_FSID_SIZE);
- write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(leaf),
- BTRFS_UUID_SIZE);
+ write_extent_buffer_fsid(leaf, fs_info->fsid);
+ write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
btrfs_mark_buffer_dirty(leaf);
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
- btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+ btrfs_set_stack_inode_nbytes(inode_item,
+ fs_info->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_flags(root_item, 0);
@@ -552,13 +552,13 @@ static noinline int create_subvol(struct inode *dir,
key.objectid = objectid;
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
- ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+ ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret)
goto fail;
key.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -599,14 +599,13 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ ret = btrfs_add_root_ref(trans, fs_info,
objectid, root->root_key.objectid,
btrfs_ino(dir), index, name, namelen);
BUG_ON(ret);
- ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
- objectid);
+ ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -614,15 +613,15 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+ btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
if (async_transid) {
*async_transid = trans->transid;
- err = btrfs_commit_transaction_async(trans, root, 1);
+ err = btrfs_commit_transaction_async(trans, 1);
if (err)
- err = btrfs_commit_transaction(trans, root);
+ err = btrfs_commit_transaction(trans);
} else {
- err = btrfs_commit_transaction(trans, root);
+ err = btrfs_commit_transaction(trans);
}
if (err && !ret)
ret = err;
@@ -662,6 +661,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
@@ -721,19 +721,17 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto fail;
}
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
if (async_transid) {
*async_transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans,
- root->fs_info->extent_root, 1);
+ ret = btrfs_commit_transaction_async(trans, 1);
if (ret)
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
} else {
- ret = btrfs_commit_transaction(trans,
- root->fs_info->extent_root);
+ ret = btrfs_commit_transaction(trans);
}
if (ret)
goto fail;
@@ -755,7 +753,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
fail:
- btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+ btrfs_subvolume_release_metadata(fs_info,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
dec_and_free:
@@ -836,13 +834,14 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(struct path *parent,
+static noinline int btrfs_mksubvol(const struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = d_inode(parent->dentry);
+ struct inode *dir = d_inode(parent->dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct dentry *dentry;
int error;
@@ -869,7 +868,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (error)
goto out_dput;
- down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+ down_read(&fs_info->subvol_sem);
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
@@ -884,7 +883,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
- up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+ up_read(&fs_info->subvol_sem);
out_dput:
dput(dentry);
out_unlock:
@@ -1268,6 +1267,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct file_ra_state *ra = NULL;
unsigned long last_index;
@@ -1365,8 +1365,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (!(inode->i_sb->s_flags & MS_ACTIVE))
break;
- if (btrfs_defrag_cancelled(root->fs_info)) {
- btrfs_debug(root->fs_info, "defrag_file cancelled");
+ if (btrfs_defrag_cancelled(fs_info)) {
+ btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
@@ -1454,18 +1454,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
*/
- atomic_inc(&root->fs_info->async_submit_draining);
- while (atomic_read(&root->fs_info->nr_async_submits) ||
- atomic_read(&root->fs_info->async_delalloc_pages)) {
- wait_event(root->fs_info->async_submit_wait,
- (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
- atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+ atomic_inc(&fs_info->async_submit_draining);
+ while (atomic_read(&fs_info->nr_async_submits) ||
+ atomic_read(&fs_info->async_delalloc_pages)) {
+ wait_event(fs_info->async_submit_wait,
+ (atomic_read(&fs_info->nr_async_submits) == 0 &&
+ atomic_read(&fs_info->async_delalloc_pages) == 0));
}
- atomic_dec(&root->fs_info->async_submit_draining);
+ atomic_dec(&fs_info->async_submit_draining);
}
if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
}
ret = defrag_count;
@@ -1485,10 +1485,12 @@ out_ra:
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
@@ -1505,13 +1507,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
+ if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
- mutex_lock(&root->fs_info->volume_mutex);
+ mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -1533,19 +1534,19 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_free;
}
- btrfs_info(root->fs_info, "resizing devid %llu", devid);
+ btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!device) {
- btrfs_info(root->fs_info, "resizer unable to find device %llu",
- devid);
+ btrfs_info(fs_info, "resizer unable to find device %llu",
+ devid);
ret = -ENODEV;
goto out_free;
}
if (!device->writeable) {
- btrfs_info(root->fs_info,
+ btrfs_info(fs_info,
"resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
@@ -1599,11 +1600,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- new_size = div_u64(new_size, root->sectorsize);
- new_size *= root->sectorsize;
+ new_size = div_u64(new_size, fs_info->sectorsize);
+ new_size *= fs_info->sectorsize;
- btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
- rcu_str_deref(device->name), new_size);
+ btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
+ rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
@@ -1612,7 +1613,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
- btrfs_commit_transaction(trans, root);
+ btrfs_commit_transaction(trans);
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */
@@ -1620,8 +1621,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
- atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mutex_unlock(&fs_info->volume_mutex);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
mnt_drop_write_file(file);
return ret;
}
@@ -1774,6 +1775,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
@@ -1781,10 +1783,10 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
- down_read(&root->fs_info->subvol_sem);
+ down_read(&fs_info->subvol_sem);
if (btrfs_root_readonly(root))
flags |= BTRFS_SUBVOL_RDONLY;
- up_read(&root->fs_info->subvol_sem);
+ up_read(&fs_info->subvol_sem);
if (copy_to_user(arg, &flags, sizeof(flags)))
ret = -EFAULT;
@@ -1796,6 +1798,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
@@ -1829,7 +1832,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_drop_write;
}
- down_write(&root->fs_info->subvol_sem);
+ down_write(&fs_info->subvol_sem);
/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
@@ -1851,9 +1854,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
spin_unlock(&root->root_item_lock);
} else {
spin_unlock(&root->root_item_lock);
- btrfs_warn(root->fs_info,
- "Attempt to set subvolume %llu read-write during send",
- root->root_key.objectid);
+ btrfs_warn(fs_info,
+ "Attempt to set subvolume %llu read-write during send",
+ root->root_key.objectid);
ret = -EPERM;
goto out_drop_sem;
}
@@ -1865,15 +1868,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_reset;
}
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
- btrfs_commit_transaction(trans, root);
+ btrfs_commit_transaction(trans);
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
- up_write(&root->fs_info->subvol_sem);
+ up_write(&fs_info->subvol_sem);
out_drop_write:
mnt_drop_write_file(file);
out:
@@ -1885,6 +1888,7 @@ out:
*/
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct btrfs_key key;
@@ -1896,14 +1900,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
return -ENOMEM;
/* Make sure this root isn't set as the default subvol */
- dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
dir_id, "default", 7, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
ret = -EPERM;
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
goto out;
@@ -1915,8 +1919,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
- &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0);
@@ -2087,10 +2090,10 @@ static noinline int search_ioctl(struct inode *inode,
size_t *buf_size,
char __user *ubuf)
{
+ struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
@@ -2353,6 +2356,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
struct dentry *parent = file->f_path.dentry;
+ struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
struct dentry *dentry;
struct inode *dir = d_inode(parent);
struct inode *inode;
@@ -2418,7 +2422,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* rmdir(2).
*/
err = -EPERM;
- if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
+ if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
/*
@@ -2462,14 +2466,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
spin_unlock(&dest->root_item_lock);
} else {
spin_unlock(&dest->root_item_lock);
- btrfs_warn(root->fs_info,
- "Attempt to delete subvolume %llu during send",
- dest->root_key.objectid);
+ btrfs_warn(fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
err = -EPERM;
goto out_unlock_inode;
}
- down_write(&root->fs_info->subvol_sem);
+ down_write(&fs_info->subvol_sem);
err = may_destroy_subvol(dest);
if (err)
@@ -2514,7 +2518,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
- root->fs_info->tree_root,
+ fs_info->tree_root,
dest->root_key.objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2523,8 +2527,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
}
- ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
- dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
@@ -2532,7 +2536,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_end_trans;
}
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
- ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+ ret = btrfs_uuid_tree_rem(trans, fs_info,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
dest->root_key.objectid);
@@ -2546,14 +2550,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
if (ret && !err)
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+ btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved);
out_up_write:
- up_write(&root->fs_info->subvol_sem);
+ up_write(&fs_info->subvol_sem);
if (err) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
@@ -2655,7 +2659,7 @@ out:
return ret;
}
-static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -2663,12 +2667,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
+ if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- }
- mutex_lock(&root->fs_info->volume_mutex);
+ mutex_lock(&fs_info->volume_mutex);
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
@@ -2676,21 +2678,22 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_init_new_device(root, vol_args->name);
+ ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
- btrfs_info(root->fs_info, "disk added %s",vol_args->name);
+ btrfs_info(fs_info, "disk added %s", vol_args->name);
kfree(vol_args);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
- atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mutex_unlock(&fs_info->volume_mutex);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
return ret;
}
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
@@ -2711,28 +2714,27 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
return -EOPNOTSUPP;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
+ if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
- mutex_lock(&root->fs_info->volume_mutex);
+ mutex_lock(&fs_info->volume_mutex);
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
- ret = btrfs_rm_device(root, NULL, vol_args->devid);
+ ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
} else {
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- ret = btrfs_rm_device(root, vol_args->name, 0);
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- mutex_unlock(&root->fs_info->volume_mutex);
- atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mutex_unlock(&fs_info->volume_mutex);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- btrfs_info(root->fs_info, "device deleted: id %llu",
+ btrfs_info(fs_info, "device deleted: id %llu",
vol_args->devid);
else
- btrfs_info(root->fs_info, "device deleted: %s",
+ btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
out:
@@ -2744,7 +2746,8 @@ err_drop:
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
int ret;
@@ -2755,8 +2758,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
+ if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -2768,26 +2770,27 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
- ret = btrfs_rm_device(root, vol_args->name, 0);
- mutex_unlock(&root->fs_info->volume_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ mutex_unlock(&fs_info->volume_mutex);
if (!ret)
- btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+ btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
out_drop_write:
mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int ret = 0;
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
@@ -2796,7 +2799,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
mutex_lock(&fs_devices->device_list_mutex);
fi_args->num_devices = fs_devices->num_devices;
- memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
+ memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
@@ -2804,9 +2807,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
}
mutex_unlock(&fs_devices->device_list_mutex);
- fi_args->nodesize = root->fs_info->super_copy->nodesize;
- fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
- fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+ fi_args->nodesize = fs_info->super_copy->nodesize;
+ fi_args->sectorsize = fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = fs_info->super_copy->sectorsize;
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -2815,11 +2818,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
return ret;
}
-static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int ret = 0;
char *s_uuid = NULL;
@@ -2831,7 +2835,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
s_uuid = di_args->uuid;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
if (!dev) {
ret = -ENODEV;
@@ -3305,10 +3309,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
out:
return ret;
}
@@ -3406,9 +3410,10 @@ static int clone_copy_inline_extent(struct inode *src,
const u64 size,
char *inline_data)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
struct btrfs_root *root = BTRFS_I(dst)->root;
const u64 aligned_end = ALIGN(new_key->offset + datal,
- root->sectorsize);
+ fs_info->sectorsize);
int ret;
struct btrfs_key key;
@@ -3529,6 +3534,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
const u64 destoff, int no_time_update)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
struct extent_buffer *leaf;
@@ -3542,9 +3548,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!buf) {
- buf = vmalloc(root->nodesize);
+ buf = vmalloc(fs_info->nodesize);
if (!buf)
return ret;
}
@@ -3707,7 +3713,7 @@ process_slot:
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
@@ -3715,7 +3721,7 @@ process_slot:
&new_key, size);
if (ret) {
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
@@ -3739,7 +3745,8 @@ process_slot:
if (disko) {
inode_add_bytes(inode, datal);
- ret = btrfs_inc_extent_ref(trans, root,
+ ret = btrfs_inc_extent_ref(trans,
+ fs_info,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
@@ -3747,8 +3754,7 @@ process_slot:
if (ret) {
btrfs_abort_transaction(trans,
ret);
- btrfs_end_transaction(trans,
- root);
+ btrfs_end_transaction(trans);
goto out;
}
@@ -3767,7 +3773,7 @@ process_slot:
if (comp && (skip || trim)) {
ret = -EINVAL;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
size -= skip + trim;
@@ -3783,7 +3789,7 @@ process_slot:
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
leaf = path->nodes[0];
@@ -3802,7 +3808,7 @@ process_slot:
btrfs_release_path(path);
last_dest_end = ALIGN(new_key.offset + datal,
- root->sectorsize);
+ fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
destoff, olen,
@@ -3843,7 +3849,7 @@ process_slot:
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
clone_update_extent_map(inode, trans, NULL, last_dest_end,
@@ -3863,10 +3869,11 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
{
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
u64 len = olen;
- u64 bs = root->fs_info->sb->s_blocksize;
+ u64 bs = fs_info->sb->s_blocksize;
int same_inode = src == inode;
/*
@@ -3980,18 +3987,6 @@ out_unlock:
return ret;
}
-ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- ssize_t ret;
-
- ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
- if (ret == 0)
- ret = len;
- return ret;
-}
-
int btrfs_clone_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, u64 len)
{
@@ -4007,6 +4002,7 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off,
static long btrfs_ioctl_trans_start(struct file *file)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
@@ -4027,7 +4023,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (ret)
goto out;
- atomic_inc(&root->fs_info->open_ioctl_trans);
+ atomic_inc(&fs_info->open_ioctl_trans);
ret = -ENOMEM;
trans = btrfs_start_ioctl_transaction(root);
@@ -4038,7 +4034,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
return 0;
out_drop:
- atomic_dec(&root->fs_info->open_ioctl_trans);
+ atomic_dec(&fs_info->open_ioctl_trans);
mnt_drop_write_file(file);
out:
return ret;
@@ -4047,6 +4043,7 @@ out:
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
@@ -4077,7 +4074,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
goto out;
@@ -4097,13 +4094,13 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
goto out;
}
- dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
- di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
+ di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path);
- btrfs_end_transaction(trans, root);
- btrfs_err(new_root->fs_info,
+ btrfs_end_transaction(trans);
+ btrfs_err(fs_info,
"Umm, you don't have the default diritem, this isn't going to work");
ret = -ENOENT;
goto out;
@@ -4114,8 +4111,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
- btrfs_end_transaction(trans, root);
+ btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
+ btrfs_end_transaction(trans);
out:
mnt_drop_write_file(file);
return ret;
@@ -4137,7 +4134,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
}
}
-static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
@@ -4165,7 +4163,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
info = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list_for_each_entry_rcu(tmp, &fs_info->space_info,
list) {
if (tmp->flags == types[i]) {
info = tmp;
@@ -4221,7 +4219,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
info = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
+ list_for_each_entry_rcu(tmp, &fs_info->space_info,
list) {
if (tmp->flags == types[i]) {
info = tmp;
@@ -4252,7 +4250,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
* Add global block reserve
*/
if (slot_count) {
- struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
spin_lock(&block_rsv->lock);
space.total_bytes = block_rsv->size;
@@ -4294,7 +4292,7 @@ long btrfs_ioctl_trans_end(struct file *file)
return -EINVAL;
file->private_data = NULL;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
atomic_dec(&root->fs_info->open_ioctl_trans);
@@ -4319,9 +4317,9 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, root, 0);
+ ret = btrfs_commit_transaction_async(trans, 0);
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
out:
@@ -4331,7 +4329,7 @@ out:
return 0;
}
-static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
void __user *argp)
{
u64 transid;
@@ -4342,12 +4340,12 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
} else {
transid = 0; /* current trans */
}
- return btrfs_wait_for_commit(root, transid);
+ return btrfs_wait_for_commit(fs_info, transid);
}
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
struct btrfs_ioctl_scrub_args *sa;
int ret;
@@ -4364,7 +4362,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
goto out;
}
- ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+ ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
@@ -4378,15 +4376,15 @@ out:
return ret;
}
-static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_scrub_cancel(root->fs_info);
+ return btrfs_scrub_cancel(fs_info);
}
-static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_scrub_args *sa;
@@ -4399,7 +4397,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
+ ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
@@ -4408,7 +4406,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
return ret;
}
-static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_get_dev_stats *sa;
@@ -4423,7 +4421,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
return -EPERM;
}
- ret = btrfs_get_dev_stats(root, sa);
+ ret = btrfs_get_dev_stats(fs_info, sa);
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
@@ -4432,7 +4430,8 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
return ret;
}
-static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
struct btrfs_ioctl_dev_replace_args *p;
int ret;
@@ -4446,27 +4445,25 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
switch (p->cmd) {
case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
- if (root->fs_info->sb->s_flags & MS_RDONLY) {
+ if (fs_info->sb->s_flags & MS_RDONLY) {
ret = -EROFS;
goto out;
}
if (atomic_xchg(
- &root->fs_info->mutually_exclusive_operation_running,
- 1)) {
+ &fs_info->mutually_exclusive_operation_running, 1)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
- ret = btrfs_dev_replace_by_ioctl(root, p);
+ ret = btrfs_dev_replace_by_ioctl(fs_info, p);
atomic_set(
- &root->fs_info->mutually_exclusive_operation_running,
- 0);
+ &fs_info->mutually_exclusive_operation_running, 0);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
- btrfs_dev_replace_status(root->fs_info, p);
+ btrfs_dev_replace_status(fs_info, p);
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
- ret = btrfs_dev_replace_cancel(root->fs_info, p);
+ ret = btrfs_dev_replace_cancel(fs_info, p);
break;
default:
ret = -EINVAL;
@@ -4559,7 +4556,7 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
return 0;
}
-static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
void __user *arg)
{
int ret = 0;
@@ -4572,11 +4569,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
return -EPERM;
loi = memdup_user(arg, sizeof(*loi));
- if (IS_ERR(loi)) {
- ret = PTR_ERR(loi);
- loi = NULL;
- goto out;
- }
+ if (IS_ERR(loi))
+ return PTR_ERR(loi);
path = btrfs_alloc_path();
if (!path) {
@@ -4592,7 +4586,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
build_ino_list, inodes);
if (ret == -EINVAL)
ret = -ENOENT;
@@ -4788,25 +4782,24 @@ out:
return ret;
}
-static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case BTRFS_BALANCE_CTL_PAUSE:
- return btrfs_pause_balance(root->fs_info);
+ return btrfs_pause_balance(fs_info);
case BTRFS_BALANCE_CTL_CANCEL:
- return btrfs_cancel_balance(root->fs_info);
+ return btrfs_cancel_balance(fs_info);
}
return -EINVAL;
}
-static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
int ret = 0;
@@ -4838,7 +4831,8 @@ out:
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_ctl_args *sa;
struct btrfs_trans_handle *trans = NULL;
int ret;
@@ -4857,8 +4851,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
goto drop_write;
}
- down_write(&root->fs_info->subvol_sem);
- trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+ down_write(&fs_info->subvol_sem);
+ trans = btrfs_start_transaction(fs_info->tree_root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -4866,22 +4860,22 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
- ret = btrfs_quota_enable(trans, root->fs_info);
+ ret = btrfs_quota_enable(trans, fs_info);
break;
case BTRFS_QUOTA_CTL_DISABLE:
- ret = btrfs_quota_disable(trans, root->fs_info);
+ ret = btrfs_quota_disable(trans, fs_info);
break;
default:
ret = -EINVAL;
break;
}
- err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+ err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
out:
kfree(sa);
- up_write(&root->fs_info->subvol_sem);
+ up_write(&fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
@@ -4889,7 +4883,9 @@ drop_write:
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -4916,19 +4912,19 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* FIXME: check if the IDs really exist */
if (sa->assign) {
- ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+ ret = btrfs_add_qgroup_relation(trans, fs_info,
sa->src, sa->dst);
} else {
- ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+ ret = btrfs_del_qgroup_relation(trans, fs_info,
sa->src, sa->dst);
}
/* update qgroup status and info */
- err = btrfs_run_qgroups(trans, root->fs_info);
+ err = btrfs_run_qgroups(trans, fs_info);
if (err < 0)
- btrfs_handle_fs_error(root->fs_info, err,
- "failed to update qgroup status and info");
- err = btrfs_end_transaction(trans, root);
+ btrfs_handle_fs_error(fs_info, err,
+ "failed to update qgroup status and info");
+ err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
@@ -4941,7 +4937,9 @@ drop_write:
static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_create_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -4973,12 +4971,12 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
/* FIXME: check if the IDs really exist */
if (sa->create) {
- ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
+ ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
} else {
- ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+ ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
}
- err = btrfs_end_transaction(trans, root);
+ err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
@@ -4991,7 +4989,9 @@ drop_write:
static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_limit_args *sa;
struct btrfs_trans_handle *trans;
int ret;
@@ -5024,9 +5024,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
}
/* FIXME: check if the IDs really exist */
- ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+ ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
- err = btrfs_end_transaction(trans, root);
+ err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
@@ -5039,7 +5039,8 @@ drop_write:
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret;
@@ -5061,7 +5062,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
goto out;
}
- ret = btrfs_qgroup_rescan(root->fs_info);
+ ret = btrfs_qgroup_rescan(fs_info);
out:
kfree(qsa);
@@ -5072,7 +5073,8 @@ drop_write:
static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret = 0;
@@ -5083,9 +5085,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
if (!qsa)
return -ENOMEM;
- if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
qsa->flags = 1;
- qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+ qsa->progress = fs_info->qgroup_rescan_progress.objectid;
}
if (copy_to_user(arg, qsa, sizeof(*qsa)))
@@ -5097,18 +5099,20 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_qgroup_wait_for_completion(root->fs_info, true);
+ return btrfs_qgroup_wait_for_completion(fs_info, true);
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
@@ -5123,7 +5127,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
if (ret < 0)
return ret;
- down_write(&root->fs_info->subvol_sem);
+ down_write(&fs_info->subvol_sem);
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
@@ -5154,8 +5158,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
!btrfs_is_empty_uuid(root_item->received_uuid))
- btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
- root_item->received_uuid,
+ btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
@@ -5166,15 +5169,14 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
if (ret < 0) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
goto out;
}
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
- ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- sa->uuid,
+ ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
@@ -5182,14 +5184,14 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
out:
- up_write(&root->fs_info->subvol_sem);
+ up_write(&fs_info->subvol_sem);
mnt_drop_write_file(file);
return ret;
}
@@ -5203,11 +5205,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
int ret = 0;
args32 = memdup_user(arg, sizeof(*args32));
- if (IS_ERR(args32)) {
- ret = PTR_ERR(args32);
- args32 = NULL;
- goto out;
- }
+ if (IS_ERR(args32))
+ return PTR_ERR(args32);
args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
@@ -5255,11 +5254,8 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
int ret = 0;
sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa)) {
- ret = PTR_ERR(sa);
- sa = NULL;
- goto out;
- }
+ if (IS_ERR(sa))
+ return PTR_ERR(sa);
ret = _btrfs_ioctl_set_received_subvol(file, sa);
@@ -5277,20 +5273,22 @@ out:
static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
size_t len;
int ret;
char label[BTRFS_LABEL_SIZE];
- spin_lock(&root->fs_info->super_lock);
- memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
- spin_unlock(&root->fs_info->super_lock);
+ spin_lock(&fs_info->super_lock);
+ memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+ spin_unlock(&fs_info->super_lock);
len = strnlen(label, BTRFS_LABEL_SIZE);
if (len == BTRFS_LABEL_SIZE) {
- btrfs_warn(root->fs_info,
- "label is too long, return the first %zu bytes", --len);
+ btrfs_warn(fs_info,
+ "label is too long, return the first %zu bytes",
+ --len);
}
ret = copy_to_user(arg, label, len);
@@ -5300,8 +5298,10 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_trans_handle *trans;
char label[BTRFS_LABEL_SIZE];
int ret;
@@ -5313,7 +5313,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
return -EFAULT;
if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"unable to set label with more than %d bytes",
BTRFS_LABEL_SIZE - 1);
return -EINVAL;
@@ -5329,10 +5329,10 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
goto out_unlock;
}
- spin_lock(&root->fs_info->super_lock);
+ spin_lock(&fs_info->super_lock);
strcpy(super_block->label, label);
- spin_unlock(&root->fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, root);
+ spin_unlock(&fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans);
out_unlock:
mnt_drop_write_file(file);
@@ -5360,8 +5360,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg)
static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags features;
features.compat_flags = btrfs_super_compat_flags(super_block);
@@ -5374,7 +5375,7 @@ static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
return 0;
}
-static int check_feature_bits(struct btrfs_root *root,
+static int check_feature_bits(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
@@ -5389,14 +5390,14 @@ static int check_feature_bits(struct btrfs_root *root,
if (unsupported) {
names = btrfs_printable_features(set, unsupported);
if (names) {
- btrfs_warn(root->fs_info,
- "this kernel does not support the %s feature bit%s",
- names, strchr(names, ',') ? "s" : "");
+ btrfs_warn(fs_info,
+ "this kernel does not support the %s feature bit%s",
+ names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
- btrfs_warn(root->fs_info,
- "this kernel does not support %s bits 0x%llx",
- type, unsupported);
+ btrfs_warn(fs_info,
+ "this kernel does not support %s bits 0x%llx",
+ type, unsupported);
return -EOPNOTSUPP;
}
@@ -5404,14 +5405,14 @@ static int check_feature_bits(struct btrfs_root *root,
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
if (names) {
- btrfs_warn(root->fs_info,
- "can't set the %s feature bit%s while mounted",
- names, strchr(names, ',') ? "s" : "");
+ btrfs_warn(fs_info,
+ "can't set the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
- btrfs_warn(root->fs_info,
- "can't set %s bits 0x%llx while mounted",
- type, disallowed);
+ btrfs_warn(fs_info,
+ "can't set %s bits 0x%llx while mounted",
+ type, disallowed);
return -EPERM;
}
@@ -5419,30 +5420,32 @@ static int check_feature_bits(struct btrfs_root *root,
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
if (names) {
- btrfs_warn(root->fs_info,
- "can't clear the %s feature bit%s while mounted",
- names, strchr(names, ',') ? "s" : "");
+ btrfs_warn(fs_info,
+ "can't clear the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
- btrfs_warn(root->fs_info,
- "can't clear %s bits 0x%llx while mounted",
- type, disallowed);
+ btrfs_warn(fs_info,
+ "can't clear %s bits 0x%llx while mounted",
+ type, disallowed);
return -EPERM;
}
return 0;
}
-#define check_feature(root, change_mask, flags, mask_base) \
-check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
+#define check_feature(fs_info, change_mask, flags, mask_base) \
+check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
BTRFS_FEATURE_ ## mask_base ## _SUPP, \
BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags flags[2];
struct btrfs_trans_handle *trans;
u64 newflags;
@@ -5459,17 +5462,17 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
!flags[0].incompat_flags)
return 0;
- ret = check_feature(root, flags[0].compat_flags,
+ ret = check_feature(fs_info, flags[0].compat_flags,
flags[1].compat_flags, COMPAT);
if (ret)
return ret;
- ret = check_feature(root, flags[0].compat_ro_flags,
+ ret = check_feature(fs_info, flags[0].compat_ro_flags,
flags[1].compat_ro_flags, COMPAT_RO);
if (ret)
return ret;
- ret = check_feature(root, flags[0].incompat_flags,
+ ret = check_feature(fs_info, flags[0].incompat_flags,
flags[1].incompat_flags, INCOMPAT);
if (ret)
return ret;
@@ -5484,7 +5487,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
goto out_drop_write;
}
- spin_lock(&root->fs_info->super_lock);
+ spin_lock(&fs_info->super_lock);
newflags = btrfs_super_compat_flags(super_block);
newflags |= flags[0].compat_flags & flags[1].compat_flags;
newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
@@ -5499,9 +5502,9 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
btrfs_set_super_incompat_flags(super_block, newflags);
- spin_unlock(&root->fs_info->super_lock);
+ spin_unlock(&fs_info->super_lock);
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
out_drop_write:
mnt_drop_write_file(file);
@@ -5511,7 +5514,9 @@ out_drop_write:
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
- struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
void __user *argp = (void __user *)arg;
switch (cmd) {
@@ -5546,15 +5551,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_RESIZE:
return btrfs_ioctl_resize(file, argp);
case BTRFS_IOC_ADD_DEV:
- return btrfs_ioctl_add_dev(root, argp);
+ return btrfs_ioctl_add_dev(fs_info, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(file, argp);
case BTRFS_IOC_RM_DEV_V2:
return btrfs_ioctl_rm_dev_v2(file, argp);
case BTRFS_IOC_FS_INFO:
- return btrfs_ioctl_fs_info(root, argp);
+ return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
- return btrfs_ioctl_dev_info(root, argp);
+ return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TRANS_START:
@@ -5570,40 +5575,40 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
- return btrfs_ioctl_logical_to_ino(root, argp);
+ return btrfs_ioctl_logical_to_ino(fs_info, argp);
case BTRFS_IOC_SPACE_INFO:
- return btrfs_ioctl_space_info(root, argp);
+ return btrfs_ioctl_space_info(fs_info, argp);
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret)
return ret;
- ret = btrfs_sync_fs(file_inode(file)->i_sb, 1);
+ ret = btrfs_sync_fs(inode->i_sb, 1);
/*
* The transaction thread may want to do more work,
* namely it pokes the cleaner kthread that will start
* processing uncleaned subvols.
*/
- wake_up_process(root->fs_info->transaction_kthread);
+ wake_up_process(fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
return btrfs_ioctl_start_sync(root, argp);
case BTRFS_IOC_WAIT_SYNC:
- return btrfs_ioctl_wait_sync(root, argp);
+ return btrfs_ioctl_wait_sync(fs_info, argp);
case BTRFS_IOC_SCRUB:
return btrfs_ioctl_scrub(file, argp);
case BTRFS_IOC_SCRUB_CANCEL:
- return btrfs_ioctl_scrub_cancel(root, argp);
+ return btrfs_ioctl_scrub_cancel(fs_info);
case BTRFS_IOC_SCRUB_PROGRESS:
- return btrfs_ioctl_scrub_progress(root, argp);
+ return btrfs_ioctl_scrub_progress(fs_info, argp);
case BTRFS_IOC_BALANCE_V2:
return btrfs_ioctl_balance(file, argp);
case BTRFS_IOC_BALANCE_CTL:
- return btrfs_ioctl_balance_ctl(root, arg);
+ return btrfs_ioctl_balance_ctl(fs_info, arg);
case BTRFS_IOC_BALANCE_PROGRESS:
- return btrfs_ioctl_balance_progress(root, argp);
+ return btrfs_ioctl_balance_progress(fs_info, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
#ifdef CONFIG_64BIT
@@ -5613,7 +5618,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SEND:
return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
- return btrfs_ioctl_get_dev_stats(root, argp);
+ return btrfs_ioctl_get_dev_stats(fs_info, argp);
case BTRFS_IOC_QUOTA_CTL:
return btrfs_ioctl_quota_ctl(file, argp);
case BTRFS_IOC_QGROUP_ASSIGN:
@@ -5629,7 +5634,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
return btrfs_ioctl_quota_rescan_wait(file, argp);
case BTRFS_IOC_DEV_REPLACE:
- return btrfs_ioctl_dev_replace(root, argp);
+ return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_FSLABEL:
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
@@ -5648,6 +5653,10 @@ long btrfs_ioctl(struct file *file, unsigned int
#ifdef CONFIG_COMPAT
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ /*
+ * These all access 32-bit values anyway so no further
+ * handling is necessary.
+ */
switch (cmd) {
case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
@@ -5658,8 +5667,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
- default:
- return -ENOIOCTLCMD;
}
return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 48655da0f4ca..45d26980caf9 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -254,25 +254,21 @@ out:
return ret;
}
-static int lzo_decompress_biovec(struct list_head *ws,
+static int lzo_decompress_bio(struct list_head *ws,
struct page **pages_in,
u64 disk_start,
- struct bio_vec *bvec,
- int vcnt,
+ struct bio *orig_bio,
size_t srclen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
char *data_in;
unsigned long page_in_index = 0;
- unsigned long page_out_index = 0;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
unsigned long working_bytes;
- unsigned long pg_offset;
-
size_t in_len;
size_t out_len;
unsigned long in_offset;
@@ -292,7 +288,6 @@ static int lzo_decompress_biovec(struct list_head *ws,
in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
- pg_offset = 0;
while (tot_in < tot_len) {
in_len = read_compress_length(data_in + in_offset);
@@ -365,16 +360,14 @@ cont:
tot_out += out_len;
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- tot_out, disk_start,
- bvec, vcnt,
- &page_out_index, &pg_offset);
+ tot_out, disk_start, orig_bio);
if (ret2 == 0)
break;
}
done:
kunmap(pages_in[page_in_index]);
if (!ret)
- btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+ zero_fill_bio(orig_bio);
return ret;
}
@@ -438,6 +431,6 @@ const struct btrfs_compress_op btrfs_lzo_compress = {
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
- .decompress_biovec = lzo_decompress_biovec,
+ .decompress_bio = lzo_decompress_bio,
.decompress = lzo_decompress,
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b2d1e95de7be..041c3326d109 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -186,6 +186,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -234,11 +235,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&root->ordered_extents);
root->nr_ordered_extents++;
if (root->nr_ordered_extents == 1) {
- spin_lock(&root->fs_info->ordered_root_lock);
+ spin_lock(&fs_info->ordered_root_lock);
BUG_ON(!list_empty(&root->ordered_root));
- list_add_tail(&root->ordered_root,
- &root->fs_info->ordered_roots);
- spin_unlock(&root->fs_info->ordered_root_lock);
+ list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
}
spin_unlock(&root->ordered_extent_lock);
@@ -303,6 +303,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 *file_offset, u64 io_size, int uptodate)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
@@ -331,14 +332,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
entry->len);
*file_offset = dec_end;
if (dec_start > dec_end) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
- "bad ordering dec_start %llu end %llu", dec_start, dec_end);
+ btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
+ dec_start, dec_end);
}
to_dec = dec_end - dec_start;
if (to_dec > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
- "bad ordered accounting left %llu size %llu",
- entry->bytes_left, to_dec);
+ btrfs_crit(fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, to_dec);
}
entry->bytes_left -= to_dec;
if (!uptodate)
@@ -588,6 +589,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
@@ -618,11 +620,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
* lock, so be nice and check if trans is set, but ASSERT() so
* if it isn't set a developer will notice.
*/
- spin_lock(&root->fs_info->trans_lock);
- trans = root->fs_info->running_transaction;
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
if (trans)
atomic_inc(&trans->use_count);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
ASSERT(trans);
if (trans) {
@@ -639,10 +641,10 @@ void btrfs_remove_ordered_extent(struct inode *inode,
trace_btrfs_ordered_extent_remove(inode, entry);
if (!root->nr_ordered_extents) {
- spin_lock(&root->fs_info->ordered_root_lock);
+ spin_lock(&fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root));
list_del_init(&root->ordered_root);
- spin_unlock(&root->fs_info->ordered_root_lock);
+ spin_unlock(&fs_info->ordered_root_lock);
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
@@ -664,6 +666,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
const u64 range_start, const u64 range_len)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(splice);
LIST_HEAD(skipped);
LIST_HEAD(works);
@@ -694,8 +697,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &ordered->flush_work);
+ btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
@@ -978,7 +980,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
ordered->file_offset +
ordered->truncated_len);
} else {
- offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
+ offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
}
disk_i_size = BTRFS_I(inode)->disk_i_size;
@@ -1087,7 +1089,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
unsigned long i;
- u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+ u32 sectorsize = btrfs_inode_sectorsize(inode);
int index = 0;
ordered = btrfs_lookup_ordered_extent(inode, offset);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 451507776ff5..5f2b0ca28705 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -145,10 +145,10 @@ struct btrfs_ordered_extent {
* calculates the total size you need to allocate for an ordered sum
* structure spanning 'bytes' in the file
*/
-static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
unsigned long bytes)
{
- int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+ int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 438575ea8d25..cdafbf92ef0c 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -161,7 +161,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
}
}
-void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
{
int i;
u32 type, nr;
@@ -182,8 +182,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
nr = btrfs_header_nritems(l);
- btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
- btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
+ btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d",
+ btrfs_header_bytenr(l), nr,
+ btrfs_leaf_free_space(fs_info, l));
for (i = 0 ; i < nr ; i++) {
item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
@@ -314,7 +315,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
}
}
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c)
{
int i; u32 nr;
struct btrfs_key key;
@@ -325,13 +326,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
nr = btrfs_header_nritems(c);
level = btrfs_header_level(c);
if (level == 0) {
- btrfs_print_leaf(root, c);
+ btrfs_print_leaf(fs_info, c);
return;
}
- btrfs_info(root->fs_info,
+ btrfs_info(fs_info,
"node %llu level %d total ptrs %d free spc %u",
btrfs_header_bytenr(c), level, nr,
- (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+ (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
pr_info("\tkey %d (%llu %u %llu) block %llu\n",
@@ -339,7 +340,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
btrfs_node_blockptr(c, i));
}
for (i = 0; i < nr; i++) {
- struct extent_buffer *next = read_tree_block(root,
+ struct extent_buffer *next = read_tree_block(fs_info,
btrfs_node_blockptr(c, i),
btrfs_node_ptr_generation(c, i));
if (IS_ERR(next)) {
@@ -355,7 +356,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
if (btrfs_header_level(next) !=
level - 1)
BUG();
- btrfs_print_tree(root, next);
+ btrfs_print_tree(fs_info, next);
free_extent_buffer(next);
}
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 7faddfacc5bd..4f2e0ea0e95a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -18,6 +18,6 @@
#ifndef __PRINT_TREE_
#define __PRINT_TREE_
-void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
+void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c);
#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index cf0b444ac4f3..f2621e330954 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -301,6 +301,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
@@ -320,14 +321,14 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!value)
continue;
- num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
num_bytes, BTRFS_RESERVE_NO_FLUSH);
if (ret)
goto out;
ret = __btrfs_set_prop(trans, inode, h->xattr_name,
value, strlen(value), 0);
- btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes);
if (ret)
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 11f4fffe503e..662821f1252c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -131,8 +131,15 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-#define ptr_to_u64(x) ((u64)(uintptr_t)x)
-#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
+{
+ return (u64)(uintptr_t)qg;
+}
+
+static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
+{
+ return (struct btrfs_qgroup *)(uintptr_t)n->aux;
+}
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
@@ -1012,7 +1019,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
list_del(&quota_root->dirty_list);
btrfs_tree_lock(quota_root->node);
- clean_tree_block(trans, tree_root->fs_info, quota_root->node);
+ clean_tree_block(trans, fs_info, quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
@@ -1066,7 +1073,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
/* Get all of the parent groups that contain this qgroup */
list_for_each_entry(glist, &qgroup->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -1074,7 +1081,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
/* Iterate all of the parents and adjust their reference counts */
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = u64_to_ptr(unode->aux);
+ qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
@@ -1087,7 +1094,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
/* Add any parents of the parents */
list_for_each_entry(glist, &qgroup->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -1185,7 +1192,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
}
spin_lock(&fs_info->qgroup_lock);
- ret = add_relation_rb(quota_root->fs_info, src, dst);
+ ret = add_relation_rb(fs_info, src, dst);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
@@ -1333,7 +1340,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
}
spin_lock(&fs_info->qgroup_lock);
- del_qgroup_rb(quota_root->fs_info, qgroupid);
+ del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -1450,7 +1457,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
{
@@ -1460,7 +1467,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
u64 bytenr = record->bytenr;
assert_spin_locked(&delayed_refs->lock);
- trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
+ trace_btrfs_qgroup_trace_extent(fs_info, record);
while (*p) {
parent_node = *p;
@@ -1479,7 +1486,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
gfp_t gfp_flag)
{
@@ -1502,14 +1509,228 @@ int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
record->old_roots = NULL;
spin_lock(&delayed_refs->lock);
- ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
- record);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
if (ret > 0)
kfree(record);
return 0;
}
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ int nr = btrfs_header_nritems(eb);
+ int i, extent_type, ret;
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *fi;
+ u64 bytenr, num_bytes;
+
+ /* We can be called directly from walk_up_proc() */
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ for (i = 0; i < nr; i++) {
+ btrfs_item_key_to_cpu(eb, &key, i);
+
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+ /* filter out non qgroup-accountable extents */
+ extent_type = btrfs_file_extent_type(eb, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+
+ bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (!bytenr)
+ continue;
+
+ num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = btrfs_qgroup_trace_extent(trans, fs_info, bytenr,
+ num_bytes, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root node's slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+ struct btrfs_path *path, int root_level)
+{
+ int level = 0;
+ int nr, slot;
+ struct extent_buffer *eb;
+
+ if (root_level == 0)
+ return 1;
+
+ while (level <= root_level) {
+ eb = path->nodes[level];
+ nr = btrfs_header_nritems(eb);
+ path->slots[level]++;
+ slot = path->slots[level];
+ if (slot >= nr || level == 0) {
+ /*
+ * Don't free the root - we will detect this
+ * condition after our loop and return a
+ * positive value for caller to stop walking the tree.
+ */
+ if (level != root_level) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
+
+ free_extent_buffer(eb);
+ path->nodes[level] = NULL;
+ path->slots[level] = 0;
+ }
+ } else {
+ /*
+ * We have a valid slot to walk back down
+ * from. Stop here so caller can process these
+ * new nodes.
+ */
+ break;
+ }
+
+ level++;
+ }
+
+ eb = path->nodes[root_level];
+ if (path->slots[root_level] >= btrfs_header_nritems(eb))
+ return 1;
+
+ return 0;
+}
+
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen, int root_level)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
+ int level;
+ struct extent_buffer *eb = root_eb;
+ struct btrfs_path *path = NULL;
+
+ BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+ BUG_ON(root_eb == NULL);
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (!extent_buffer_uptodate(root_eb)) {
+ ret = btrfs_read_buffer(root_eb, root_gen);
+ if (ret)
+ goto out;
+ }
+
+ if (root_level == 0) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, root_eb);
+ goto out;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Walk down the tree. Missing extent blocks are filled in as
+ * we go. Metadata is accounted every time we read a new
+ * extent block.
+ *
+ * When we reach a leaf, we account for file extent items in it,
+ * walk back up the tree (adjusting slot pointers as we go)
+ * and restart the search process.
+ */
+ extent_buffer_get(root_eb); /* For path */
+ path->nodes[root_level] = root_eb;
+ path->slots[root_level] = 0;
+ path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+ level = root_level;
+ while (level >= 0) {
+ if (path->nodes[level] == NULL) {
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * We need to get child blockptr/gen from parent before
+ * we can read it.
+ */
+ eb = path->nodes[level + 1];
+ parent_slot = path->slots[level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ path->nodes[level] = eb;
+ path->slots[level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = btrfs_qgroup_trace_extent(trans, fs_info,
+ child_bytenr,
+ fs_info->nodesize,
+ GFP_NOFS);
+ if (ret)
+ goto out;
+ }
+
+ if (level == 0) {
+ ret = btrfs_qgroup_trace_leaf_items(trans,fs_info,
+ path->nodes[level]);
+ if (ret)
+ goto out;
+
+ /* Nonzero return here means we completed our search */
+ ret = adjust_slots_upwards(root, path, root_level);
+ if (ret)
+ break;
+
+ /* Restart search with new slots */
+ goto walk_down;
+ }
+
+ level--;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
@@ -1535,30 +1756,30 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
continue;
ulist_reinit(tmp);
- ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
GFP_ATOMIC);
if (ret < 0)
return ret;
- ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
+ ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = u64_to_ptr(tmp_unode->aux);
+ qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group),
+ qgroup_to_aux(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group),
+ qgroup_to_aux(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
@@ -1619,7 +1840,7 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
while ((unode = ulist_next(qgroups, &uiter))) {
bool dirty = false;
- qg = u64_to_ptr(unode->aux);
+ qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
@@ -1950,7 +2171,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
}
rcu_read_lock();
- level_size = srcroot->nodesize;
+ level_size = fs_info->nodesize;
rcu_read_unlock();
}
@@ -2034,8 +2255,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
if (*i_qgroups) {
- ret = add_relation_rb(quota_root->fs_info, objectid,
- *i_qgroups);
+ ret = add_relation_rb(fs_info, objectid, *i_qgroups);
if (ret)
goto unlock;
}
@@ -2125,7 +2345,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = u64_to_ptr(unode->aux);
+ qg = unode_aux_to_qgroup(unode);
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes >
@@ -2157,7 +2377,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
- qg = u64_to_ptr(unode->aux);
+ qg = unode_aux_to_qgroup(unode);
qg->reserved += num_bytes;
}
@@ -2202,7 +2422,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = u64_to_ptr(unode->aux);
+ qg = unode_aux_to_qgroup(unode);
qg->reserved -= num_bytes;
@@ -2302,7 +2522,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
- num_bytes = fs_info->extent_root->nodesize;
+ num_bytes = fs_info->nodesize;
else
num_bytes = found.offset;
@@ -2335,10 +2555,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
-
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -2356,9 +2572,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = qgroup_rescan_leaf(fs_info, path, trans);
}
if (err > 0)
- btrfs_commit_transaction(trans, fs_info->fs_root);
+ btrfs_commit_transaction(trans);
else
- btrfs_end_transaction(trans, fs_info->fs_root);
+ btrfs_end_transaction(trans);
}
out:
@@ -2393,7 +2609,7 @@ out:
err = ret;
btrfs_err(fs_info, "fail to update qgroup status: %d", err);
}
- btrfs_end_transaction(trans, fs_info->quota_root);
+ btrfs_end_transaction(trans);
if (btrfs_fs_closing(fs_info)) {
btrfs_info(fs_info, "qgroup scan paused");
@@ -2449,6 +2665,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
+ fs_info->qgroup_rescan_running = true;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2512,7 +2729,7 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
return PTR_ERR(trans);
}
- ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ ret = btrfs_commit_transaction(trans);
if (ret) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
return ret;
@@ -2677,13 +2894,14 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->objectid) || num_bytes == 0)
return 0;
- BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
ret = qgroup_reserve(root, num_bytes);
if (ret < 0)
return ret;
@@ -2693,9 +2911,10 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int reserved;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->objectid))
return;
@@ -2707,11 +2926,13 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
{
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->objectid))
return;
- BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
atomic_sub(num_bytes, &root->qgroup_meta_rsv);
qgroup_free(root, num_bytes);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1bc64c864b62..416ae8e1d23c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -23,6 +23,34 @@
#include "delayed-ref.h"
/*
+ * Btrfs qgroup overview
+ *
+ * Btrfs qgroup splits into 3 main parts:
+ * 1) Reserve
+ * Reserve metadata/data space for incoming operations
+ * Affect how qgroup limit works
+ *
+ * 2) Trace
+ * Tell btrfs qgroup to trace dirty extents.
+ *
+ * Dirty extents include:
+ * - Newly allocated extents
+ * - Extents going to be deleted (in this trans)
+ * - Extents whose owner is going to be modified
+ *
+ * This is the main part that affects whether qgroup numbers will stay
+ * consistent.
+ * Btrfs qgroup can trace clean extents without causing any problem,
+ * but doing so consumes extra CPU time and should be avoided if possible.
+ *
+ * 3) Account
+ * Btrfs qgroup will update its numbers based on the dirty extents
+ * traced in the previous step.
+ *
+ * Normally at qgroup rescan and transaction commit time.
+ */
+
+/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -65,8 +93,8 @@ struct btrfs_delayed_extent_op;
int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
/*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent, its info is recorded in @record.
+ * So qgroup can account it at commit trans time.
*
* No lock version, caller must acquire delayed ref lock and allocate memory.
*
@@ -74,14 +102,15 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
* Return >0 for existing record, caller can free @record safely.
* Error is not possible
*/
-int btrfs_qgroup_insert_dirty_extent_nolock(
+int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
/*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes.
+ * So qgroup can account it at commit trans time.
*
* Better encapsulated version.
*
@@ -89,10 +118,33 @@ int btrfs_qgroup_insert_dirty_extent_nolock(
* Return <0 for error, like memory allocation failure or invalid parameter
* (NULL trans)
*/
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
gfp_t gfp_flag);
+/*
+ * Inform qgroup to trace all leaf items of data
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM)
+ */
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb);
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation(tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM or tree search error)
+ */
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *root_eb,
+ u64 root_gen, int root_level);
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d016d4a79864..d2a9a1ee5361 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -969,8 +969,9 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* allocation and initial setup for the btrfs_raid_bio. Not
* this does not allocate any pages for rbio->pages.
*/
-static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
- struct btrfs_bio *bbio, u64 stripe_len)
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
+ struct btrfs_bio *bbio,
+ u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
@@ -991,7 +992,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bbio = bbio;
- rbio->fs_info = root->fs_info;
+ rbio->fs_info = fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
@@ -1144,10 +1145,10 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
+ struct bio_vec *bvec;
u64 start;
unsigned long stripe_offset;
unsigned long page_index;
- struct page *p;
int i;
spin_lock_irq(&rbio->bio_list_lock);
@@ -1156,10 +1157,8 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
- for (i = 0; i < bio->bi_vcnt; i++) {
- p = bio->bi_io_vec[i].bv_page;
- rbio->bio_pages[page_index + i] = p;
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ rbio->bio_pages[page_index + i] = bvec->bv_page;
}
spin_unlock_irq(&rbio->bio_list_lock);
}
@@ -1433,13 +1432,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
*/
static void set_bio_pages_uptodate(struct bio *bio)
{
+ struct bio_vec *bvec;
int i;
- struct page *p;
- for (i = 0; i < bio->bi_vcnt; i++) {
- p = bio->bi_io_vec[i].bv_page;
- SetPageUptodate(p);
- }
+ bio_for_each_segment_all(bvec, bio, i)
+ SetPageUptodate(bvec->bv_page);
}
/*
@@ -1482,11 +1479,8 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper,
- rmw_work, NULL, NULL);
-
- btrfs_queue_work(rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL);
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
@@ -1494,8 +1488,7 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
btrfs_init_work(&rbio->work, btrfs_rmw_helper,
read_rebuild_work, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -1577,8 +1570,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_rmw_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- btrfs_bio_wq_end_io(rbio->fs_info, bio,
- BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -1743,7 +1735,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
@@ -1751,7 +1743,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(root, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
@@ -1760,7 +1752,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
- btrfs_bio_counter_inc_noblocked(root->fs_info);
+ btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
/*
@@ -1770,16 +1762,15 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
return ret;
}
- cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
- sizeof(*plug));
+ cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (!plug->info) {
- plug->info = root->fs_info;
+ plug->info = fs_info;
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
@@ -1787,7 +1778,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
} else {
ret = __raid56_parity_write(rbio);
if (ret)
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
}
return ret;
}
@@ -2102,8 +2093,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid_recover_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- btrfs_bio_wq_end_io(rbio->fs_info, bio,
- BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2123,14 +2113,14 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
- rbio = alloc_rbio(root, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
btrfs_put_bbio(bbio);
@@ -2143,7 +2133,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
- btrfs_warn(root->fs_info,
+ btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
__func__, (u64)bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bbio->map_type);
@@ -2154,7 +2144,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
}
if (generic_io) {
- btrfs_bio_counter_inc_noblocked(root->fs_info);
+ btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
btrfs_get_bbio(bbio);
@@ -2212,7 +2202,7 @@ static void read_rebuild_work(struct btrfs_work *work)
*/
struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
@@ -2220,7 +2210,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(root, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2239,7 +2229,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
}
/* Now we just support the sectorsize equals to page size */
- ASSERT(root->sectorsize == PAGE_SIZE);
+ ASSERT(fs_info->sectorsize == PAGE_SIZE);
ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
@@ -2621,8 +2611,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
bio->bi_end_io = raid56_parity_scrub_end_io;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- btrfs_bio_wq_end_io(rbio->fs_info, bio,
- BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
@@ -2650,8 +2639,7 @@ static void async_scrub_parity(struct btrfs_raid_bio *rbio)
btrfs_init_work(&rbio->work, btrfs_rmw_helper,
scrub_parity_work, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
@@ -2663,12 +2651,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 length)
{
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(root, bbio, length);
+ rbio = alloc_rbio(fs_info, bbio, length);
if (IS_ERR(rbio))
return NULL;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 8b694699d502..4ee4fe346838 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -42,24 +42,24 @@ static inline int nr_data_stripes(struct map_lookup *map)
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 75bab76739be..e88bca87f5d2 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -107,18 +107,14 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
/* in case of err, eb might be NULL */
static void __readahead_hook(struct btrfs_fs_info *fs_info,
struct reada_extent *re, struct extent_buffer *eb,
- u64 start, int err)
+ int err)
{
- int level = 0;
int nritems;
int i;
u64 bytenr;
u64 generation;
struct list_head list;
- if (eb)
- level = btrfs_header_level(eb);
-
spin_lock(&re->lock);
/*
* just take the full list from the extent. afterwards we
@@ -143,7 +139,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info,
* trigger more readahead depending from the content, e.g.
* fetch the checksums for the extents in the leaf.
*/
- if (!level)
+ if (!btrfs_header_level(eb))
goto cleanup;
nritems = btrfs_header_nritems(eb);
@@ -213,12 +209,8 @@ cleanup:
return;
}
-/*
- * start is passed separately in case eb in NULL, which may be the case with
- * failed I/O
- */
int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, u64 start, int err)
+ struct extent_buffer *eb, int err)
{
int ret = 0;
struct reada_extent *re;
@@ -226,7 +218,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info,
/* find extent */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree,
- start >> PAGE_SHIFT);
+ eb->start >> PAGE_SHIFT);
if (re)
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -235,7 +227,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info,
goto start_machine;
}
- __readahead_hook(fs_info, re, eb, start, err);
+ __readahead_hook(fs_info, re, eb, err);
reada_extent_put(fs_info, re); /* our ref */
start_machine:
@@ -311,14 +303,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
return zone;
}
-static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 logical,
struct btrfs_key *top)
{
int ret;
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
@@ -343,7 +334,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (!re)
return NULL;
- blocksize = root->nodesize;
+ blocksize = fs_info->nodesize;
re->logical = logical;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
@@ -354,13 +345,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
* map block
*/
length = blocksize;
- ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
- &bbio, 0);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &length, &bbio, 0);
if (ret || !bbio || length < blocksize)
goto error;
if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"readahead: more than %d copies not supported",
BTRFS_MAX_MIRRORS);
goto error;
@@ -401,7 +392,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
- BUG_ON(!re_exist);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -448,7 +438,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
/* ignore whether the entry was inserted */
radix_tree_delete(&dev->reada_extents, index);
}
- BUG_ON(fs_info == NULL);
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -554,17 +543,18 @@ static void reada_control_release(struct kref *kref)
static int reada_add_block(struct reada_control *rc, u64 logical,
struct btrfs_key *top, u64 generation)
{
- struct btrfs_root *root = rc->root;
+ struct btrfs_fs_info *fs_info = rc->fs_info;
struct reada_extent *re;
struct reada_extctl *rec;
- re = reada_find_extent(root, logical, top); /* takes one ref */
+ /* takes one ref */
+ re = reada_find_extent(fs_info, logical, top);
if (!re)
return -1;
rec = kzalloc(sizeof(*rec), GFP_KERNEL);
if (!rec) {
- reada_extent_put(root->fs_info, re);
+ reada_extent_put(fs_info, re);
return -ENOMEM;
}
@@ -688,7 +678,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->reada_lock);
return 0;
}
- dev->reada_next = re->logical + fs_info->tree_root->nodesize;
+ dev->reada_next = re->logical + fs_info->nodesize;
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -714,12 +704,11 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
logical = re->logical;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info->extent_root, logical,
- mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
if (ret)
- __readahead_hook(fs_info, re, NULL, logical, ret);
+ __readahead_hook(fs_info, re, NULL, ret);
else if (eb)
- __readahead_hook(fs_info, re, eb, eb->start, ret);
+ __readahead_hook(fs_info, re, eb, ret);
if (eb)
free_extent_buffer(eb);
@@ -852,7 +841,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0)
break;
pr_debug(" re: logical %llu size %u empty %d scheduled %d",
- re->logical, fs_info->tree_root->nodesize,
+ re->logical, fs_info->nodesize,
list_empty(&re->extctl), re->scheduled);
for (i = 0; i < re->nzones; ++i) {
@@ -885,7 +874,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
continue;
}
pr_debug("re: logical %llu size %u list empty %d scheduled %d",
- re->logical, fs_info->tree_root->nodesize,
+ re->logical, fs_info->nodesize,
list_empty(&re->extctl), re->scheduled);
for (i = 0; i < re->nzones; ++i) {
pr_cont(" zone %llu-%llu devs",
@@ -924,7 +913,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
if (!rc)
return ERR_PTR(-ENOMEM);
- rc->root = root;
+ rc->fs_info = root->fs_info;
rc->key_start = *key_start;
rc->key_end = *key_end;
atomic_set(&rc->elems, 0);
@@ -952,18 +941,17 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->root->fs_info;
+ struct btrfs_fs_info *fs_info = rc->fs_info;
while (atomic_read(&rc->elems)) {
if (!atomic_read(&fs_info->reada_works_cnt))
reada_start_machine(fs_info);
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
5 * HZ);
- dump_devs(rc->root->fs_info,
- atomic_read(&rc->elems) < 10 ? 1 : 0);
+ dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
}
- dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
+ dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
kref_put(&rc->refcnt, reada_control_release);
@@ -973,7 +961,7 @@ int btrfs_reada_wait(void *handle)
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->root->fs_info;
+ struct btrfs_fs_info *fs_info = rc->fs_info;
while (atomic_read(&rc->elems)) {
if (!atomic_read(&fs_info->reada_works_cnt))
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c4af0cdb783d..379711048fb0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1288,9 +1288,10 @@ fail:
*/
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node;
- struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct reloc_control *rc = fs_info->reloc_ctl;
node = kmalloc(sizeof(*node), GFP_NOFS);
if (!node)
@@ -1304,7 +1305,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
- btrfs_panic(root->fs_info, -EEXIST,
+ btrfs_panic(fs_info, -EEXIST,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
kfree(node);
@@ -1321,9 +1322,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
*/
static void __del_reloc_root(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node = NULL;
- struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct reloc_control *rc = fs_info->reloc_ctl;
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
@@ -1338,9 +1340,9 @@ static void __del_reloc_root(struct btrfs_root *root)
return;
BUG_ON((struct btrfs_root *)node->data != root);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
list_del_init(&root->root_list);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
kfree(node);
}
@@ -1350,9 +1352,10 @@ static void __del_reloc_root(struct btrfs_root *root)
*/
static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node = NULL;
- struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct reloc_control *rc = fs_info->reloc_ctl;
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
@@ -1380,11 +1383,11 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
- u64 last_snap = 0;
int ret;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1395,14 +1398,22 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
root_key.offset = objectid;
if (root->root_key.objectid == objectid) {
+ u64 commit_root_gen;
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
-
- last_snap = btrfs_root_last_snapshot(&root->root_item);
- btrfs_set_root_last_snapshot(&root->root_item,
- trans->transid - 1);
+ /*
+ * Set the last_snapshot field to the generation of the commit
+ * root - like this ctree.c:btrfs_block_can_be_shared() behaves
+ * correctly (returns true) when the relocation root is created
+ * either inside the critical section of a transaction commit
+ * (through transaction.c:qgroup_account_snapshot()) and when
+ * it's created before the transaction commit is started.
+ */
+ commit_root_gen = btrfs_header_generation(root->commit_root);
+ btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
} else {
/*
* called by btrfs_reloc_post_snapshot_hook.
@@ -1426,23 +1437,17 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
root_item->drop_level = 0;
- /*
- * abuse rtransid, it is safe because it is impossible to
- * receive data into a relocation tree.
- */
- btrfs_set_root_rtransid(root_item, last_snap);
- btrfs_set_root_otransid(root_item, trans->transid);
}
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+ ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
+ reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
return reloc_root;
@@ -1455,8 +1460,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
- struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct reloc_control *rc = fs_info->reloc_ctl;
struct btrfs_block_rsv *rsv;
int clear_rsv = 0;
int ret;
@@ -1492,6 +1498,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
int ret;
@@ -1502,7 +1509,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
- if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+ if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
__del_reloc_root(reloc_root);
@@ -1514,7 +1521,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root->commit_root = btrfs_root_node(reloc_root);
}
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
BUG_ON(ret);
@@ -1642,6 +1649,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *leaf)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct inode *inode = NULL;
@@ -1698,8 +1706,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
- root->sectorsize));
- WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ fs_info->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
key.offset, end);
@@ -1727,7 +1735,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
- ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+ ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
key.objectid, key.offset);
@@ -1736,7 +1744,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
break;
}
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+ ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
@@ -1777,6 +1785,7 @@ int replace_path(struct btrfs_trans_handle *trans,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
{
+ struct btrfs_fs_info *fs_info = dest->fs_info;
struct extent_buffer *eb;
struct extent_buffer *parent;
struct btrfs_key key;
@@ -1834,7 +1843,7 @@ again:
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
- blocksize = dest->nodesize;
+ blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
@@ -1860,7 +1869,7 @@ again:
break;
}
- eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
+ eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -1901,6 +1910,29 @@ again:
BUG_ON(ret);
/*
+ * Info qgroup to trace both subtrees.
+ *
+ * We must trace both trees.
+ * 1) Tree reloc subtree
+ * If not traced, we will leak data numbers
+ * 2) Fs subtree
+ * If not traced, we will double count old data
+ * and tree block numbers, if current trans doesn't free
+ * data reloc tree inode.
+ */
+ ret = btrfs_qgroup_trace_subtree(trans, src, parent,
+ btrfs_header_generation(parent),
+ btrfs_header_level(parent));
+ if (ret < 0)
+ break;
+ ret = btrfs_qgroup_trace_subtree(trans, dest,
+ path->nodes[level],
+ btrfs_header_generation(path->nodes[level]),
+ btrfs_header_level(path->nodes[level]));
+ if (ret < 0)
+ break;
+
+ /*
* swap blocks in fs tree and reloc tree.
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
@@ -1913,21 +1945,21 @@ again:
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(path->nodes[level]);
- ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
- path->nodes[level]->start,
+ ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr,
+ blocksize, path->nodes[level]->start,
src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
- ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
- 0, dest->root_key.objectid, level - 1,
- 0);
+ ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
+ blocksize, 0, dest->root_key.objectid,
+ level - 1, 0);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+ ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize,
path->nodes[level]->start,
src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
- ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+ ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
0);
BUG_ON(ret);
@@ -1986,6 +2018,7 @@ static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb = NULL;
int i;
u64 bytenr;
@@ -2016,7 +2049,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- eb = read_tree_block(root, bytenr, ptr_gen);
+ eb = read_tree_block(fs_info, bytenr, ptr_gen);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2038,6 +2071,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct btrfs_key *min_key,
struct btrfs_key *max_key)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = NULL;
u64 objectid;
u64 start, end;
@@ -2072,7 +2106,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
start = 0;
else {
start = min_key->offset;
- WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+ WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
}
} else {
start = 0;
@@ -2087,7 +2121,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
if (max_key->offset == 0)
continue;
end = max_key->offset;
- WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+ WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
}
} else {
@@ -2127,6 +2161,7 @@ static int find_next_key(struct btrfs_path *path, int level,
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
@@ -2175,7 +2210,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
@@ -2236,10 +2271,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path->slots[level]);
root_item->drop_level = level;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction_throttle(trans);
trans = NULL;
- btrfs_btree_balance_dirty(root);
+ btrfs_btree_balance_dirty(fs_info);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2267,9 +2302,9 @@ out:
}
if (trans)
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction_throttle(trans);
- btrfs_btree_balance_dirty(root);
+ btrfs_btree_balance_dirty(fs_info);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
@@ -2281,16 +2316,17 @@ static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
struct btrfs_root *root = rc->extent_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct btrfs_trans_handle *trans;
LIST_HEAD(reloc_roots);
u64 num_bytes = 0;
int ret;
- mutex_lock(&root->fs_info->reloc_mutex);
- rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ mutex_lock(&fs_info->reloc_mutex);
+ rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
rc->merging_rsv_size += rc->nodes_relocated * 2;
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
again:
if (!err) {
@@ -2304,16 +2340,16 @@ again:
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
if (!err)
- btrfs_block_rsv_release(rc->extent_root,
- rc->block_rsv, num_bytes);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv,
+ num_bytes);
return PTR_ERR(trans);
}
if (!err) {
if (num_bytes != rc->merging_rsv_size) {
- btrfs_end_transaction(trans, rc->extent_root);
- btrfs_block_rsv_release(rc->extent_root,
- rc->block_rsv, num_bytes);
+ btrfs_end_transaction(trans);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv,
+ num_bytes);
goto again;
}
}
@@ -2325,8 +2361,7 @@ again:
struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
- root = read_fs_root(reloc_root->fs_info,
- reloc_root->root_key.offset);
+ root = read_fs_root(fs_info, reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
@@ -2344,9 +2379,9 @@ again:
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans, rc->extent_root);
+ btrfs_commit_transaction(trans);
else
- btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_end_transaction(trans);
return err;
}
@@ -2369,11 +2404,9 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
- u64 last_snap;
- u64 otransid;
- u64 objectid;
LIST_HEAD(reloc_roots);
int found = 0;
int ret = 0;
@@ -2386,9 +2419,9 @@ again:
* adding their roots to the list while we are
* doing this splice
*/
- mutex_lock(&root->fs_info->reloc_mutex);
+ mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
while (!list_empty(&reloc_roots)) {
found = 1;
@@ -2396,7 +2429,7 @@ again:
struct btrfs_root, root_list);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- root = read_fs_root(reloc_root->fs_info,
+ root = read_fs_root(fs_info,
reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
@@ -2412,14 +2445,6 @@ again:
list_del_init(&reloc_root->root_list);
}
- /*
- * we keep the old last snapshot transid in rtranid when we
- * created the relocation tree.
- */
- last_snap = btrfs_root_rtransid(&reloc_root->root_item);
- otransid = btrfs_root_otransid(&reloc_root->root_item);
- objectid = reloc_root->root_key.offset;
-
ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
if (ret < 0) {
if (list_empty(&reloc_root->root_list))
@@ -2435,14 +2460,14 @@ again:
}
out:
if (ret) {
- btrfs_handle_fs_error(root->fs_info, ret, NULL);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
/* new reloc root may be added */
- mutex_lock(&root->fs_info->reloc_mutex);
+ mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
}
@@ -2464,12 +2489,13 @@ static void free_block_list(struct rb_root *blocks)
static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *reloc_root)
{
+ struct btrfs_fs_info *fs_info = reloc_root->fs_info;
struct btrfs_root *root;
if (reloc_root->last_trans == trans->transid)
return 0;
- root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+ root = read_fs_root(fs_info, reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
@@ -2579,6 +2605,7 @@ static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
struct backref_node *node, int reserve)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct backref_node *next = node;
struct backref_edge *edge;
struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
@@ -2593,7 +2620,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
if (next->processed && (reserve || next != node))
break;
- num_bytes += rc->extent_root->nodesize;
+ num_bytes += fs_info->nodesize;
if (list_empty(&next->upper))
break;
@@ -2613,6 +2640,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct backref_node *node)
{
struct btrfs_root *root = rc->extent_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_bytes;
int ret;
u64 tmp;
@@ -2630,7 +2658,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
+ tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
tmp <<= 1;
/*
@@ -2640,8 +2668,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* space for relocation and we will return eailer in
* enospc case.
*/
- rc->block_rsv->size = tmp + rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
+ rc->block_rsv->size = tmp + fs_info->nodesize *
+ RELOCATION_RESERVED_NODES;
return -EAGAIN;
}
@@ -2661,6 +2689,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct backref_node *upper;
struct backref_edge *edge;
struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
@@ -2741,9 +2770,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
- blocksize = root->nodesize;
+ blocksize = root->fs_info->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(root, bytenr, generation);
+ eb = read_tree_block(fs_info, bytenr, generation);
if (IS_ERR(eb)) {
err = PTR_ERR(eb);
goto next;
@@ -2772,7 +2801,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(upper->eb);
- ret = btrfs_inc_extent_ref(trans, root,
+ ret = btrfs_inc_extent_ref(trans, root->fs_info,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
@@ -2854,7 +2883,7 @@ static void __mark_block_processed(struct reloc_control *rc,
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
- blocksize = rc->extent_root->nodesize;
+ blocksize = rc->extent_root->fs_info->nodesize;
mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
@@ -2894,7 +2923,7 @@ static void update_processed_blocks(struct reloc_control *rc,
static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
- u32 blocksize = rc->extent_root->nodesize;
+ u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
@@ -2902,14 +2931,13 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
return 0;
}
-static int get_tree_block_key(struct reloc_control *rc,
+static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct tree_block *block)
{
struct extent_buffer *eb;
BUG_ON(block->key_ready);
- eb = read_tree_block(rc->extent_root, block->bytenr,
- block->key.offset);
+ eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2988,6 +3016,7 @@ static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
@@ -3005,7 +3034,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- readahead_tree_block(rc->extent_root, block->bytenr);
+ readahead_tree_block(fs_info, block->bytenr);
rb_node = rb_next(rb_node);
}
@@ -3013,7 +3042,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready) {
- err = get_tree_block_key(rc, block);
+ err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
@@ -3107,7 +3136,7 @@ static noinline_for_stack
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -3120,7 +3149,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -3141,6 +3170,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 page_start;
u64 page_end;
u64 offset = BTRFS_I(inode)->index_cnt;
@@ -3236,7 +3266,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
index++;
balance_dirty_pages_ratelimited(inode->i_mapping);
- btrfs_throttle(BTRFS_I(inode)->root);
+ btrfs_throttle(fs_info);
}
WARN_ON(nr != cluster->nr);
out:
@@ -3376,7 +3406,7 @@ static int add_tree_block(struct reloc_control *rc,
return -ENOMEM;
block->bytenr = extent_key->objectid;
- block->key.objectid = rc->extent_root->nodesize;
+ block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
@@ -3395,11 +3425,11 @@ static int __add_tree_block(struct reloc_control *rc,
u64 bytenr, u32 blocksize,
struct rb_root *blocks)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
- SKINNY_METADATA);
+ bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
if (tree_block_processed(bytenr, rc))
return 0;
@@ -3465,7 +3495,7 @@ static int block_use_full_backref(struct reloc_control *rc,
btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
return 1;
- ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+ ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
eb->start, btrfs_header_level(eb), 1,
NULL, &flags);
BUG_ON(ret);
@@ -3502,7 +3532,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
}
truncate:
- ret = btrfs_check_trunc_cache_free_space(root,
+ ret = btrfs_check_trunc_cache_free_space(fs_info,
&fs_info->global_block_rsv);
if (ret)
goto out;
@@ -3515,8 +3545,8 @@ truncate:
ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
out:
iput(inode);
return ret;
@@ -3532,6 +3562,7 @@ static int find_data_references(struct reloc_control *rc,
struct btrfs_extent_data_ref *ref,
struct rb_root *blocks)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_path *path;
struct tree_block *block;
struct btrfs_root *root;
@@ -3558,8 +3589,7 @@ static int find_data_references(struct reloc_control *rc,
* it and redo the search.
*/
if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
- ret = delete_block_group_cache(rc->extent_root->fs_info,
- rc->block_group,
+ ret = delete_block_group_cache(fs_info, rc->block_group,
NULL, ref_objectid);
if (ret != -ENOENT)
return ret;
@@ -3571,7 +3601,7 @@ static int find_data_references(struct reloc_control *rc,
return -ENOMEM;
path->reada = READA_FORWARD;
- root = read_fs_root(rc->extent_root->fs_info, ref_root);
+ root = read_fs_root(fs_info, ref_root);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
@@ -3706,7 +3736,7 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize = rc->extent_root->nodesize;
+ u32 blocksize = rc->extent_root->fs_info->nodesize;
int ret = 0;
int err = 0;
@@ -3797,6 +3827,7 @@ static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
struct btrfs_key *extent_key)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_key key;
struct extent_buffer *leaf;
u64 start, end, last;
@@ -3848,7 +3879,7 @@ next:
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
- key.objectid + rc->extent_root->nodesize <=
+ key.objectid + fs_info->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
@@ -3866,7 +3897,7 @@ next:
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
- rc->extent_root->nodesize;
+ fs_info->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
@@ -3913,7 +3944,7 @@ int prepare_to_relocate(struct reloc_control *rc)
struct btrfs_trans_handle *trans;
int ret;
- rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
@@ -3924,7 +3955,7 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
rc->reserved_bytes = 0;
- rc->block_rsv->size = rc->extent_root->nodesize *
+ rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
RELOCATION_RESERVED_NODES;
ret = btrfs_block_rsv_refill(rc->extent_root,
rc->block_rsv, rc->block_rsv->size,
@@ -3945,96 +3976,13 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans, rc->extent_root);
+ btrfs_commit_transaction(trans);
return 0;
}
-/*
- * Qgroup fixer for data chunk relocation.
- * The data relocation is done in the following steps
- * 1) Copy data extents into data reloc tree
- * 2) Create tree reloc tree(special snapshot) for related subvolumes
- * 3) Modify file extents in tree reloc tree
- * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
- *
- * The problem is, data and tree reloc tree are not accounted to qgroup,
- * and 4) will only info qgroup to track tree blocks change, not file extents
- * in the tree blocks.
- *
- * The good news is, related data extents are all in data reloc tree, so we
- * only need to info qgroup to track all file extents in data reloc tree
- * before commit trans.
- */
-static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
- struct reloc_control *rc)
-{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct inode *inode = rc->data_inode;
- struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
- struct btrfs_key key;
- int ret = 0;
-
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return 0;
-
- /*
- * Only for stage where we update data pointers the qgroup fix is
- * valid.
- * For MOVING_DATA stage, we will miss the timing of swapping tree
- * blocks, and won't fix it.
- */
- if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
-
- ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
- while (1) {
- struct btrfs_file_extent_item *fi;
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid > btrfs_ino(inode))
- break;
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- goto next;
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(path->nodes[0], fi) !=
- BTRFS_FILE_EXTENT_REG)
- goto next;
- ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
- btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
- btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
- GFP_NOFS);
- if (ret < 0)
- break;
-next:
- ret = btrfs_next_item(data_reloc_root, path);
- if (ret < 0)
- break;
- if (ret > 0) {
- ret = 0;
- break;
- }
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
@@ -4075,7 +4023,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
- btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_end_transaction(trans);
continue;
}
@@ -4163,8 +4111,8 @@ restart:
}
}
- btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root);
+ btrfs_end_transaction_throttle(trans);
+ btrfs_btree_balance_dirty(fs_info);
trans = NULL;
if (rc->stage == MOVE_DATA_EXTENTS &&
@@ -4179,7 +4127,7 @@ restart:
}
}
if (trans && progress && err == -ENOSPC) {
- ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+ ret = btrfs_force_chunk_alloc(trans, fs_info,
rc->block_group->flags);
if (ret == 1) {
err = 0;
@@ -4192,8 +4140,8 @@ restart:
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
if (trans) {
- btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root);
+ btrfs_end_transaction_throttle(trans);
+ btrfs_btree_balance_dirty(fs_info);
}
if (!err) {
@@ -4207,7 +4155,7 @@ restart:
set_reloc_control(rc);
backref_cache_cleanup(&rc->backref_cache);
- btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
err = prepare_to_merge(rc, err);
@@ -4215,7 +4163,7 @@ restart:
rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
- btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
@@ -4223,16 +4171,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- ret = qgroup_fix_relocated_data_extents(trans, rc);
- if (ret < 0) {
- btrfs_abort_transaction(trans, ret);
- if (!err)
- err = ret;
- goto out_free;
- }
- btrfs_commit_transaction(trans, rc->extent_root);
+ btrfs_commit_transaction(trans);
out_free:
- btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+ btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
return err;
}
@@ -4255,7 +4196,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
@@ -4300,14 +4241,14 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
BTRFS_I(inode)->index_cnt = group->key.objectid;
err = btrfs_orphan_add(trans, inode);
out:
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
if (err) {
if (inode)
iput(inode);
@@ -4333,11 +4274,50 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
}
/*
+ * Log which block group is about to be relocated.
+ *
+ * Builds a '|'-separated list of the BTRFS_BLOCK_GROUP_* bits that are set
+ * in block_group->flags (e.g. "data|raid1") and prints it, together with
+ * block_group->key.objectid, through btrfs_info().  Bits that have no name
+ * in the list below are appended as a single trailing hex value.
+ */
+static void describe_relocation(struct btrfs_fs_info *fs_info,
+				struct btrfs_block_group_cache *block_group)
+{
+	char buf[128];		/* prefixed by a '|' that'll be dropped */
+	u64 flags = block_group->flags;
+
+	/* Shouldn't happen */
+	if (!flags) {
+		strcpy(buf, "|NONE");
+	} else {
+		char *bp = buf;	/* next write position inside buf */
+
+		/*
+		 * Append "|<d>" at bp when bit <f> is set, then clear the bit
+		 * so that whatever remains in flags afterwards is unnamed.
+		 */
+#define DESCRIBE_FLAG(f, d) \
+		if (flags & BTRFS_BLOCK_GROUP_##f) { \
+			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
+			flags &= ~BTRFS_BLOCK_GROUP_##f; \
+		}
+		DESCRIBE_FLAG(DATA, "data");
+		DESCRIBE_FLAG(SYSTEM, "system");
+		DESCRIBE_FLAG(METADATA, "metadata");
+		DESCRIBE_FLAG(RAID0, "raid0");
+		DESCRIBE_FLAG(RAID1, "raid1");
+		DESCRIBE_FLAG(DUP, "dup");
+		DESCRIBE_FLAG(RAID10, "raid10");
+		DESCRIBE_FLAG(RAID5, "raid5");
+		DESCRIBE_FLAG(RAID6, "raid6");
+		/*
+		 * Leftover unnamed bits: append at bp, not at buf, so the
+		 * names collected above are kept instead of being overwritten
+		 * (the size argument is already relative to bp).
+		 */
+		if (flags)
+			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
+#undef DESCRIBE_FLAG
+	}
+
+	/* buf + 1 skips the leading '|' separator */
+	btrfs_info(fs_info,
+		   "relocating block group %llu flags %s",
+		   block_group->key.objectid, buf + 1);
+}
+
+/*
* function to relocate all extents in a block group.
*/
-int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
- struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
struct btrfs_path *path;
@@ -4388,9 +4368,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
goto out;
}
- btrfs_info(extent_root->fs_info,
- "relocating block group %llu flags %llu",
- rc->block_group->key.objectid, rc->block_group->flags);
+ describe_relocation(fs_info, rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
@@ -4410,8 +4388,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
if (rc->extents_found == 0)
break;
- btrfs_info(extent_root->fs_info, "found %llu extents",
- rc->extents_found);
+ btrfs_info(fs_info, "found %llu extents", rc->extents_found);
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
ret = btrfs_wait_ordered_range(rc->data_inode, 0,
@@ -4431,7 +4408,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
if (err && rw)
- btrfs_dec_block_group_ro(extent_root, rc->block_group);
+ btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
kfree(rc);
@@ -4440,10 +4417,11 @@ out:
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret, err;
- trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -4451,10 +4429,10 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
sizeof(root->root_item.drop_progress));
root->root_item.drop_level = 0;
btrfs_set_root_refs(&root->root_item, 0);
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
- err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ err = btrfs_end_transaction(trans);
if (err)
return err;
return ret;
@@ -4468,6 +4446,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
*/
int btrfs_recover_relocation(struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
@@ -4489,7 +4468,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
path, 0, 0);
if (ret < 0) {
err = ret;
@@ -4517,7 +4496,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
list_add(&reloc_root->root_list, &reloc_roots);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- fs_root = read_fs_root(root->fs_info,
+ fs_root = read_fs_root(fs_info,
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
@@ -4543,13 +4522,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = alloc_reloc_control(root->fs_info);
+ rc = alloc_reloc_control(fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
}
- rc->extent_root = root->fs_info->extent_root;
+ rc->extent_root = fs_info->extent_root;
set_reloc_control(rc);
@@ -4573,8 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
continue;
}
- fs_root = read_fs_root(root->fs_info,
- reloc_root->root_key.offset);
+ fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
goto out_free;
@@ -4585,7 +4563,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root->reloc_root = reloc_root;
}
- err = btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans);
if (err)
goto out_free;
@@ -4598,12 +4576,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = PTR_ERR(trans);
goto out_free;
}
- err = qgroup_fix_relocated_data_extents(trans, rc);
- if (err < 0) {
- btrfs_abort_transaction(trans, err);
- goto out_free;
- }
- err = btrfs_commit_transaction(trans, rc->extent_root);
+ err = btrfs_commit_transaction(trans);
out_free:
kfree(rc);
out:
@@ -4614,8 +4587,7 @@ out:
if (err == 0) {
/* cleanup orphan inode in data relocation tree */
- fs_root = read_fs_root(root->fs_info,
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
if (IS_ERR(fs_root))
err = PTR_ERR(fs_root);
else
@@ -4632,9 +4604,9 @@ out:
*/
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
u64 disk_bytenr;
u64 new_bytenr;
@@ -4644,7 +4616,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+ ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
goto out;
@@ -4679,13 +4651,14 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct reloc_control *rc;
struct backref_node *node;
int first_cow = 0;
int level;
int ret = 0;
- rc = root->fs_info->reloc_ctl;
+ rc = fs_info->reloc_ctl;
if (!rc)
return 0;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index edae751e870c..4c6735491ee0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,6 +132,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
int ret;
@@ -150,9 +151,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
}
if (ret != 0) {
- btrfs_print_leaf(root, path->nodes[0]);
- btrfs_crit(root->fs_info,
- "unable to update root key %llu %u %llu",
+ btrfs_print_leaf(fs_info, path->nodes[0]);
+ btrfs_crit(fs_info, "unable to update root key %llu %u %llu",
key->objectid, key->type, key->offset);
BUG_ON(1);
}
@@ -216,8 +216,9 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
-int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
@@ -227,7 +228,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
int ret;
bool can_recover = true;
- if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+ if (fs_info->sb->s_flags & MS_RDONLY)
can_recover = false;
path = btrfs_alloc_path();
@@ -275,8 +276,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
* in turn reads and inserts fs roots while doing backref
* walking.
*/
- root = btrfs_lookup_fs_root(tree_root->fs_info,
- root_key.objectid);
+ root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
if (root) {
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state));
@@ -297,15 +297,15 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_handle_fs_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(fs_info, err,
"Failed to start trans to delete orphan item");
break;
}
err = btrfs_del_orphan_item(trans, tree_root,
root_key.objectid);
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction(trans);
if (err) {
- btrfs_handle_fs_error(tree_root->fs_info, err,
+ btrfs_handle_fs_error(fs_info, err,
"Failed to delete root orphan item");
break;
}
@@ -320,7 +320,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
- err = btrfs_insert_fs_root(root->fs_info, root);
+ err = btrfs_insert_fs_root(fs_info, root);
if (err) {
BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
@@ -358,11 +358,12 @@ out:
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len)
{
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -429,10 +430,11 @@ out:
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *tree_root,
+ struct btrfs_fs_info *fs_info,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len)
{
+ struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_key key;
int ret;
struct btrfs_path *path;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fffb9ab8526e..9a94670536a6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -171,7 +171,7 @@ struct scrub_wr_ctx {
struct scrub_ctx {
struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
- struct btrfs_root *dev_root;
+ struct btrfs_fs_info *fs_info;
int first_free;
int curr;
atomic_t bios_in_flight;
@@ -356,7 +356,7 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
*/
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
atomic_inc(&sctx->refs);
/*
@@ -388,7 +388,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
/*
* see scrub_pending_trans_workers_inc() why we're pretending
@@ -458,7 +458,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
- struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
@@ -468,7 +468,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
- sctx->dev_root = dev->dev_root;
+ sctx->fs_info = dev->fs_info;
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -489,8 +489,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx->bios[i]->next_free = -1;
}
sctx->first_free = 0;
- sctx->nodesize = dev->dev_root->nodesize;
- sctx->sectorsize = dev->dev_root->sectorsize;
+ sctx->nodesize = fs_info->nodesize;
+ sctx->sectorsize = fs_info->sectorsize;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
@@ -524,7 +524,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
- struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key root_key;
@@ -618,7 +618,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
WARN_ON(sblock->page_count < 1);
dev = sblock->pagev[0]->dev;
- fs_info = sblock->sctx->dev_root->fs_info;
+ fs_info = sblock->sctx->fs_info;
path = btrfs_alloc_path();
if (!path)
@@ -789,6 +789,7 @@ out:
static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
+ struct btrfs_fs_info *fs_info;
int ret;
struct scrub_fixup_nodatasum *fixup;
struct scrub_ctx *sctx;
@@ -798,6 +799,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
fixup = container_of(work, struct scrub_fixup_nodatasum, work);
sctx = fixup->sctx;
+ fs_info = fixup->root->fs_info;
path = btrfs_alloc_path();
if (!path) {
@@ -823,9 +825,8 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
* (once it's finished) and rewrite the failed sector if a good copy
* can be found.
*/
- ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
- path, scrub_fixup_readpage,
- fixup);
+ ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
+ scrub_fixup_readpage, fixup);
if (ret < 0) {
uncorrectable = 1;
goto out;
@@ -838,15 +839,14 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
out:
if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans, fixup->root);
+ btrfs_end_transaction(trans);
if (uncorrectable) {
spin_lock(&sctx->stat_lock);
++sctx->stat.uncorrectable_errors;
spin_unlock(&sctx->stat_lock);
btrfs_dev_replace_stats_inc(
- &sctx->dev_root->fs_info->dev_replace.
- num_uncorrectable_read_errors);
- btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ &fs_info->dev_replace.num_uncorrectable_read_errors);
+ btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (nodatasum) error at logical %llu on dev %s",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -898,7 +898,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
- fs_info = sctx->dev_root->fs_info;
+ fs_info = sctx->fs_info;
if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* if we find an error in a super block, we just report it.
@@ -1177,9 +1177,7 @@ nodatasum_case:
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
btrfs_dev_replace_stats_inc(
- &sctx->dev_root->
- fs_info->dev_replace.
- num_write_errors);
+ &fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
@@ -1302,7 +1300,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
struct scrub_ctx *sctx = original_sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
u64 generation = original_sblock->pagev[0]->generation;
@@ -1334,8 +1332,8 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
- ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0, 1);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &mapped_length, &bbio, 0, 1);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
return -EIO;
@@ -1452,7 +1450,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+ ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
page->recover->map_length,
page->mirror_num, 0);
if (ret)
@@ -1565,6 +1563,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
{
struct scrub_page *page_bad = sblock_bad->pagev[page_num];
struct scrub_page *page_good = sblock_good->pagev[page_num];
+ struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
BUG_ON(page_bad->page == NULL);
BUG_ON(page_good->page == NULL);
@@ -1574,7 +1573,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
int ret;
if (!page_bad->dev->bdev) {
- btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
+ btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
@@ -1596,8 +1595,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
btrfs_dev_replace_stats_inc(
- &sblock_bad->sctx->dev_root->fs_info->
- dev_replace.num_write_errors);
+ &fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1609,6 +1607,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
+ struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
int page_num;
/*
@@ -1624,8 +1623,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
btrfs_dev_replace_stats_inc(
- &sblock->sctx->dev_root->fs_info->dev_replace.
- num_write_errors);
+ &fs_info->dev_replace.num_write_errors);
}
}
@@ -1740,7 +1738,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
static void scrub_wr_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
- struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
sbio->err = bio->bi_error;
sbio->bio = bio;
@@ -1759,7 +1757,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
if (sbio->err) {
struct btrfs_dev_replace *dev_replace =
- &sbio->sctx->dev_root->fs_info->dev_replace;
+ &sbio->sctx->fs_info->dev_replace;
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -1859,8 +1857,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
- struct btrfs_root *root = sctx->dev_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
@@ -2126,7 +2123,7 @@ again:
static void scrub_missing_raid56_end_io(struct bio *bio)
{
struct scrub_block *sblock = bio->bi_private;
- struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
if (bio->bi_error)
sblock->no_io_error_seen = 0;
@@ -2140,6 +2137,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 logical;
struct btrfs_device *dev;
@@ -2153,14 +2151,14 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ btrfs_err_rl_in_rcu(fs_info,
"IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ btrfs_err_rl_in_rcu(fs_info,
"failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
@@ -2182,7 +2180,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
struct btrfs_bio *bbio = NULL;
@@ -2191,8 +2189,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
int ret;
int i;
- ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
- &bbio, 0, 1);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &length, &bbio, 0, 1);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2215,7 +2213,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
if (!rbio)
goto rbio_out;
@@ -2334,7 +2332,7 @@ leave_nomem:
static void scrub_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
- struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
sbio->err = bio->bi_error;
sbio->bio = bio;
@@ -2391,7 +2389,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
{
u32 offset;
int nsectors;
- int sectorsize = sparity->sctx->dev_root->sectorsize;
+ int sectorsize = sparity->sctx->fs_info->sectorsize;
if (len >= sparity->stripe_len) {
bitmap_set(bitmap, 0, sparity->nsectors);
@@ -2750,6 +2748,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
static void scrub_parity_bio_endio(struct bio *bio)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
@@ -2759,13 +2758,13 @@ static void scrub_parity_bio_endio(struct bio *bio)
btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
scrub_parity_bio_endio_worker, NULL, NULL);
- btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
- &sparity->work);
+ btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
struct scrub_page *spage;
@@ -2778,8 +2777,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto out;
length = sparity->logic_end - sparity->logic_start;
- ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
- sparity->logic_start,
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
&length, &bbio, 0, 1);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2792,7 +2790,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+ rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
length, sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
@@ -2844,7 +2842,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 logic_start,
u64 logic_end)
{
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
@@ -2866,7 +2864,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nsectors = div_u64(map->stripe_len, root->sectorsize);
+ nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
@@ -2937,7 +2935,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
goto next;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->nodesize;
+ bytes = fs_info->nodesize;
else
bytes = key.offset;
@@ -2988,8 +2986,9 @@ again:
mapped_length = extent_len;
bbio = NULL;
- ret = btrfs_map_block(fs_info, READ, extent_logical,
- &mapped_length, &bbio, 0);
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
+ extent_logical, &mapped_length, &bbio,
+ 0);
if (!ret) {
if (!bbio || mapped_length < extent_len)
ret = -EIO;
@@ -3068,7 +3067,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int is_dev_replace)
{
struct btrfs_path *path, *ppath;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
@@ -3289,7 +3288,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
goto next;
if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = root->nodesize;
+ bytes = fs_info->nodesize;
else
bytes = key.offset;
@@ -3442,8 +3441,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_block_group_cache *cache,
int is_dev_replace)
{
- struct btrfs_mapping_tree *map_tree =
- &sctx->dev_root->fs_info->mapping_tree;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
int i;
@@ -3496,8 +3495,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = sctx->dev_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
u64 length;
u64 chunk_offset;
int ret = 0;
@@ -3617,8 +3616,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (IS_ERR(trans))
ret = PTR_ERR(trans);
else
- ret = btrfs_commit_transaction(trans,
- root);
+ ret = btrfs_commit_transaction(trans);
if (ret) {
scrub_pause_off(fs_info);
btrfs_put_block_group(cache);
@@ -3693,7 +3691,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
if (ro_set)
- btrfs_dec_block_group_ro(root, cache);
+ btrfs_dec_block_group_ro(cache);
/*
* We might have prevented the cleaner kthread from deleting
@@ -3746,16 +3744,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
u64 bytenr;
u64 gen;
int ret;
- struct btrfs_root *root = sctx->dev_root;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EIO;
/* Seed devices of a new filesystem has their own generation. */
- if (scrub_dev->fs_devices != root->fs_info->fs_devices)
+ if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
- gen = root->fs_info->last_trans_committed;
+ gen = fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
@@ -3847,7 +3845,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (btrfs_fs_closing(fs_info))
return -EINVAL;
- if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
+ if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
* the way scrub is implemented. Do not handle this
@@ -3855,31 +3853,31 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
btrfs_err(fs_info,
"scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
- fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
+ fs_info->nodesize,
+ BTRFS_STRIPE_LEN);
return -EINVAL;
}
- if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
+ if (fs_info->sectorsize != PAGE_SIZE) {
/* not supported for data w/o checksums */
btrfs_err_rl(fs_info,
"scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
- fs_info->chunk_root->sectorsize, PAGE_SIZE);
+ fs_info->sectorsize, PAGE_SIZE);
return -EINVAL;
}
- if (fs_info->chunk_root->nodesize >
+ if (fs_info->nodesize >
PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
- fs_info->chunk_root->sectorsize >
- PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
/*
* would exhaust the array bounds of pagev member in
* struct scrub_block
*/
btrfs_err(fs_info,
"scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->chunk_root->nodesize,
+ fs_info->nodesize,
SCRUB_MAX_PAGES_PER_BLOCK,
- fs_info->chunk_root->sectorsize,
+ fs_info->sectorsize,
SCRUB_MAX_PAGES_PER_BLOCK);
return -EINVAL;
}
@@ -3979,10 +3977,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return ret;
}
-void btrfs_scrub_pause(struct btrfs_root *root)
+void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
mutex_lock(&fs_info->scrub_lock);
atomic_inc(&fs_info->scrub_pause_req);
while (atomic_read(&fs_info->scrubs_paused) !=
@@ -3996,10 +3992,8 @@ void btrfs_scrub_pause(struct btrfs_root *root)
mutex_unlock(&fs_info->scrub_lock);
}
-void btrfs_scrub_continue(struct btrfs_root *root)
+void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
atomic_dec(&fs_info->scrub_pause_req);
wake_up(&fs_info->scrub_pause_wait);
}
@@ -4048,19 +4042,19 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (dev)
sctx = dev->scrub_device;
if (sctx)
memcpy(progress, &sctx->stat, sizeof(*progress));
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
@@ -4076,7 +4070,7 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
int ret;
mapped_length = extent_len;
- ret = btrfs_map_block(fs_info, READ, extent_logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
&mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < extent_len ||
!bbio->stripes[0].dev->bdev) {
@@ -4122,7 +4116,7 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace)
{
struct scrub_copy_nocow_ctx *nocow_ctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
if (!nocow_ctx) {
@@ -4170,20 +4164,17 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
struct scrub_copy_nocow_ctx *nocow_ctx =
container_of(work, struct scrub_copy_nocow_ctx, work);
struct scrub_ctx *sctx = nocow_ctx->sctx;
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
u64 logical = nocow_ctx->logical;
u64 len = nocow_ctx->len;
int mirror_num = nocow_ctx->mirror_num;
u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
int ret;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
- struct btrfs_root *root;
int not_written = 0;
- fs_info = sctx->dev_root->fs_info;
- root = fs_info->extent_root;
-
path = btrfs_alloc_path();
if (!path) {
spin_lock(&sctx->stat_lock);
@@ -4210,7 +4201,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
goto out;
}
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
trans = NULL;
while (!list_empty(&nocow_ctx->inodes)) {
struct scrub_nocow_inode *entry;
@@ -4238,7 +4229,7 @@ out:
kfree(entry);
}
if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
if (not_written)
btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
num_uncorrectable_read_errors);
@@ -4296,7 +4287,7 @@ out_unlock:
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
- struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
struct btrfs_key key;
struct inode *inode;
struct page *page;
@@ -4426,7 +4417,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
if (!dev)
return -EIO;
if (!dev->bdev) {
- btrfs_warn_rl(dev->dev_root->fs_info,
+ btrfs_warn_rl(dev->fs_info,
"scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
@@ -4440,7 +4431,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ret = bio_add_page(bio, page, PAGE_SIZE, 0);
if (ret != PAGE_SIZE) {
leave_with_eio:
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 71261b459863..d145ce804620 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1054,7 +1054,8 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
ret = -ENAMETOOLONG;
goto out;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
+ if (name_len + data_len >
+ BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
ret = -E2BIG;
goto out;
}
@@ -1430,9 +1431,9 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
- ret = iterate_extent_inodes(fs_info,
- found_key.objectid, extent_item_pos, 1,
- __iterate_backrefs, backref_ctx);
+ ret = iterate_extent_inodes(fs_info, found_key.objectid,
+ extent_item_pos, 1, __iterate_backrefs,
+ backref_ctx);
if (ret < 0)
goto out;
@@ -3434,6 +3435,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
+ struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key di_key;
@@ -3462,8 +3464,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- di = btrfs_match_dir_item_name(sctx->parent_root, path,
- parent_ref->name, parent_ref->name_len);
+ di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
+ parent_ref->name_len);
if (!di) {
ret = 0;
goto out;
@@ -5264,7 +5266,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
u64 size = btrfs_file_extent_inline_len(path->nodes[0],
path->slots[0], fi);
extent_end = ALIGN(key.offset + size,
- sctx->send_root->sectorsize);
+ sctx->send_root->fs_info->sectorsize);
} else {
extent_end = key.offset +
btrfs_file_extent_num_bytes(path->nodes[0], fi);
@@ -5299,7 +5301,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
u64 size = btrfs_file_extent_inline_len(path->nodes[0],
path->slots[0], fi);
extent_end = ALIGN(key->offset + size,
- sctx->send_root->sectorsize);
+ sctx->send_root->fs_info->sectorsize);
} else {
extent_end = key->offset +
btrfs_file_extent_num_bytes(path->nodes[0], fi);
@@ -6110,7 +6112,7 @@ again:
goto commit_trans;
if (trans)
- return btrfs_end_transaction(trans, sctx->send_root);
+ return btrfs_end_transaction(trans);
return 0;
@@ -6123,7 +6125,7 @@ commit_trans:
goto again;
}
- return btrfs_commit_transaction(trans, sctx->send_root);
+ return btrfs_commit_transaction(trans);
}
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
@@ -6136,17 +6138,17 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
*/
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
- "send_in_progres unbalanced %d root %llu",
- root->send_in_progress, root->root_key.objectid);
+ "send_in_progres unbalanced %d root %llu",
+ root->send_in_progress, root->root_key.objectid);
spin_unlock(&root->root_item_lock);
}
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
{
int ret = 0;
- struct btrfs_root *send_root;
+ struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
- struct btrfs_fs_info *fs_info;
struct btrfs_ioctl_send_args *arg = NULL;
struct btrfs_key key;
struct send_ctx *sctx = NULL;
@@ -6160,9 +6162,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- send_root = BTRFS_I(file_inode(mnt_file))->root;
- fs_info = send_root->fs_info;
-
/*
* The subvolume must remain read-only during send, protect against
* making it RW. This also protects against deletion.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74ed5aae6cea..b5ae7d3d1896 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,27 +202,25 @@ static struct ratelimit_state printk_limits[] = {
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
- char lvl[4];
+ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
- const char *type = logtypes[4];
int kern_level;
- struct ratelimit_state *ratelimit;
+ const char *type = logtypes[4];
+ struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt);
- kern_level = printk_get_level(fmt);
- if (kern_level) {
+ while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
- memcpy(lvl, fmt, size);
- lvl[size] = '\0';
+
+ if (kern_level >= '0' && kern_level <= '7') {
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ type = logtypes[kern_level - '0'];
+ ratelimit = &printk_limits[kern_level - '0'];
+ }
fmt += size;
- type = logtypes[kern_level - '0'];
- ratelimit = &printk_limits[kern_level - '0'];
- } else {
- *lvl = '\0';
- /* Default to debug output */
- ratelimit = &printk_limits[7];
}
vaf.fmt = fmt;
@@ -305,7 +303,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
static void btrfs_put_super(struct super_block *sb)
{
- close_ctree(btrfs_sb(sb)->tree_root);
+ close_ctree(btrfs_sb(sb));
}
enum {
@@ -396,10 +394,9 @@ static const match_table_t tokens = {
* reading in a new superblock is parsed here.
* XXX JDM: This needs to be cleaned up for remount.
*/
-int btrfs_parse_options(struct btrfs_root *root, char *options,
+int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags)
{
- struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
char *p, *num, *orig = NULL;
u64 cache_gen;
@@ -411,8 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
bool saved_compress_force;
int no_compress = 0;
- cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ cache_gen = btrfs_super_cache_generation(info->super_copy);
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
@@ -442,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
token = match_token(p, tokens, args);
switch (token) {
case Opt_degraded:
- btrfs_info(root->fs_info, "allowing degraded mounts");
+ btrfs_info(info, "allowing degraded mounts");
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
@@ -461,11 +458,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
case Opt_datasum:
if (btrfs_test_opt(info, NODATASUM)) {
if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"setting datasum, datacow enabled");
else
- btrfs_info(root->fs_info,
- "setting datasum");
+ btrfs_info(info, "setting datasum");
}
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -474,11 +470,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
if (!btrfs_test_opt(info, NODATACOW)) {
if (!btrfs_test_opt(info, COMPRESS) ||
!btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"setting nodatacow, compression disabled");
} else {
- btrfs_info(root->fs_info,
- "setting nodatacow");
+ btrfs_info(info, "setting nodatacow");
}
}
btrfs_clear_opt(info->mount_opt, COMPRESS);
@@ -545,8 +540,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
compress_force != saved_compress_force)) ||
(!btrfs_test_opt(info, COMPRESS) &&
no_compress == 1)) {
- btrfs_info(root->fs_info,
- "%s %s compression",
+ btrfs_info(info, "%s %s compression",
(compress_force) ? "force" : "use",
compress_type);
}
@@ -594,10 +588,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
if (info->max_inline) {
info->max_inline = min_t(u64,
info->max_inline,
- root->sectorsize);
+ info->sectorsize);
}
- btrfs_info(root->fs_info, "max_inline at %llu",
- info->max_inline);
+ btrfs_info(info, "max_inline at %llu",
+ info->max_inline);
} else {
ret = -ENOMEM;
goto out;
@@ -610,8 +604,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
info->alloc_start = memparse(num, NULL);
mutex_unlock(&info->chunk_mutex);
kfree(num);
- btrfs_info(root->fs_info,
- "allocations start at %llu",
+ btrfs_info(info, "allocations start at %llu",
info->alloc_start);
} else {
ret = -ENOMEM;
@@ -620,16 +613,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- root->fs_info->sb->s_flags |= MS_POSIXACL;
+ info->sb->s_flags |= MS_POSIXACL;
break;
#else
- btrfs_err(root->fs_info,
- "support for ACL not compiled in!");
+ btrfs_err(info, "support for ACL not compiled in!");
ret = -EINVAL;
goto out;
#endif
case Opt_noacl:
- root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+ info->sb->s_flags &= ~MS_POSIXACL;
break;
case Opt_notreelog:
btrfs_set_and_info(info, NOTREELOG,
@@ -658,8 +650,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
goto out;
} else if (intarg >= 0) {
info->metadata_ratio = intarg;
- btrfs_info(root->fs_info, "metadata ratio %d",
- info->metadata_ratio);
+ btrfs_info(info, "metadata ratio %d",
+ info->metadata_ratio);
} else {
ret = -EINVAL;
goto out;
@@ -677,15 +669,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
case Opt_space_cache_version:
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(root->fs_info->mount_opt,
+ btrfs_clear_opt(info->mount_opt,
FREE_SPACE_TREE);
btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
+ "enabling disk space caching");
} else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(root->fs_info->mount_opt,
+ btrfs_clear_opt(info->mount_opt,
SPACE_CACHE);
- btrfs_set_and_info(info,
- FREE_SPACE_TREE,
+ btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
ret = -EINVAL;
@@ -697,14 +688,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
break;
case Opt_no_space_cache:
if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info,
- SPACE_CACHE,
- "disabling disk space caching");
+ btrfs_clear_and_info(info, SPACE_CACHE,
+ "disabling disk space caching");
}
if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info,
- FREE_SPACE_TREE,
- "disabling free space tree");
+ btrfs_clear_and_info(info, FREE_SPACE_TREE,
+ "disabling free space tree");
}
break;
case Opt_inode_cache:
@@ -737,10 +726,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_warn(root->fs_info,
+ btrfs_warn(info,
"'recovery' is deprecated, use 'usebackuproot' instead");
case Opt_usebackuproot:
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
break;
@@ -749,14 +738,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
break;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
case Opt_check_integrity_including_extent_data:
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"enabling check integrity including extent data");
btrfs_set_opt(info->mount_opt,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
- btrfs_info(root->fs_info, "enabling check integrity");
+ btrfs_info(info, "enabling check integrity");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity_print_mask:
@@ -765,7 +754,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
goto out;
} else if (intarg >= 0) {
info->check_integrity_print_mask = intarg;
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
} else {
@@ -777,8 +766,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
case Opt_check_integrity_including_extent_data:
case Opt_check_integrity:
case Opt_check_integrity_print_mask:
- btrfs_err(root->fs_info,
- "support for check_integrity* not compiled in!");
+ btrfs_err(info,
+ "support for check_integrity* not compiled in!");
ret = -EINVAL;
goto out;
#endif
@@ -798,20 +787,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
intarg = 0;
ret = match_int(&args[0], &intarg);
if (ret < 0) {
- btrfs_err(root->fs_info,
- "invalid commit interval");
+ btrfs_err(info, "invalid commit interval");
ret = -EINVAL;
goto out;
}
if (intarg > 0) {
if (intarg > 300) {
- btrfs_warn(root->fs_info,
+ btrfs_warn(info,
"excessive commit interval %d",
intarg);
}
info->commit_interval = intarg;
} else {
- btrfs_info(root->fs_info,
+ btrfs_info(info,
"using default commit interval %ds",
BTRFS_DEFAULT_COMMIT_INTERVAL);
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
@@ -819,23 +807,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
- btrfs_info(root->fs_info, "fragmenting all space");
+ btrfs_info(info, "fragmenting all space");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
break;
case Opt_fragment_metadata:
- btrfs_info(root->fs_info, "fragmenting metadata");
+ btrfs_info(info, "fragmenting metadata");
btrfs_set_opt(info->mount_opt,
FRAGMENT_METADATA);
break;
case Opt_fragment_data:
- btrfs_info(root->fs_info, "fragmenting data");
+ btrfs_info(info, "fragmenting data");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
break;
#endif
case Opt_err:
- btrfs_info(root->fs_info,
- "unrecognized mount option '%s'", p);
+ btrfs_info(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -847,22 +834,22 @@ check:
* Extra check for current option against current flag
*/
if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
- btrfs_err(root->fs_info,
+ btrfs_err(info,
"nologreplay must be used with ro mount option");
ret = -EINVAL;
}
out:
- if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(root->fs_info, "cannot disable free space tree");
+ btrfs_err(info, "cannot disable free space tree");
ret = -EINVAL;
}
if (!ret && btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(root->fs_info, "disk space caching is enabled");
+ btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(root->fs_info, "using free space tree");
+ btrfs_info(info, "using free space tree");
kfree(orig);
return ret;
}
@@ -1173,7 +1160,7 @@ static int btrfs_fill_super(struct super_block *sb,
return 0;
fail_close:
- close_ctree(fs_info->tree_root);
+ close_ctree(fs_info);
return err;
}
@@ -1217,13 +1204,12 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
if (IS_ERR(trans))
return PTR_ERR(trans);
}
- return btrfs_commit_transaction(trans, root);
+ return btrfs_commit_transaction(trans);
}
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
- struct btrfs_root *root = info->tree_root;
char *compress_type;
if (btrfs_test_opt(info, DEGRADED))
@@ -1265,7 +1251,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD))
seq_puts(seq, ",discard");
- if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+ if (!(info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
seq_puts(seq, ",space_cache");
@@ -1744,7 +1730,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
}
- ret = btrfs_parse_options(root, data, *flags);
+ ret = btrfs_parse_options(fs_info, data, *flags);
if (ret) {
ret = -EINVAL;
goto restore;
@@ -1784,11 +1770,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(fs_info);
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -1901,9 +1887,10 @@ static inline void btrfs_descending_sort_devices(
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
-static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -2086,10 +2073,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- /*
- * holding chunk_mutex to avoid allocating new chunks, holding
- * device_list_mutex to avoid the device being removed
- */
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -2141,7 +2124,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_unlock(&block_rsv->lock);
buf->f_bavail = div_u64(total_free_data, factor);
- ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
+ ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
if (ret)
return ret;
buf->f_bavail += div_u64(total_free_data, factor);
@@ -2249,9 +2232,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
- root->fs_info->fs_frozen = 1;
+ fs_info->fs_frozen = 1;
/*
* We don't need a barrier here, we'll wait for any transaction that
* could be in progress on other threads (and do delayed iputs that
@@ -2265,14 +2249,12 @@ static int btrfs_freeze(struct super_block *sb)
return 0;
return PTR_ERR(trans);
}
- return btrfs_commit_transaction(trans, root);
+ return btrfs_commit_transaction(trans);
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb)->tree_root;
-
- root->fs_info->fs_frozen = 0;
+ btrfs_sb(sb)->fs_frozen = 0;
return 0;
}
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad919a95..ea272432c930 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -79,7 +79,7 @@ static void btrfs_destroy_test_fs(void)
unregister_filesystem(&test_type);
}
-struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
GFP_KERNEL);
@@ -100,6 +100,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
return NULL;
}
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+
if (init_srcu_struct(&fs_info->subvol_srcu)) {
kfree(fs_info->fs_devices);
kfree(fs_info->super_copy);
@@ -162,6 +165,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
slot = radix_tree_iter_retry(&iter);
continue;
}
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock);
@@ -189,7 +193,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
}
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize)
+btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
+ unsigned long length)
{
struct btrfs_block_group_cache *cache;
@@ -206,8 +211,9 @@ btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize)
cache->key.objectid = 0;
cache->key.offset = length;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = sectorsize;
- cache->full_stripe_len = sectorsize;
+ cache->sectorsize = fs_info->sectorsize;
+ cache->full_stripe_len = fs_info->sectorsize;
+ cache->fs_info = fs_info;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index b17ffbe8f9f3..266f1e3d1784 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -34,11 +34,11 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
-struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
-btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize);
+btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 199569174637..b9142c614114 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -41,13 +41,13 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
test_msg("Running btrfs_split_item tests\n");
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Could not allocate fs_info\n");
return -ENOMEM;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Could not allocate root\n");
ret = PTR_ERR(root);
@@ -61,8 +61,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize,
- nodesize);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index caad80bb9bd0..133753232a94 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -306,7 +306,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
int ret;
memset(bitmap, 0, len);
- memset_extent_buffer(eb, 0, 0, len);
+ memzero_extent_buffer(eb, 0, len);
if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
test_msg("Bitmap was not zeroed\n");
return -EINVAL;
@@ -383,6 +383,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info;
unsigned long len;
unsigned long *bitmap;
struct extent_buffer *eb;
@@ -397,13 +398,15 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
? sectorsize * 4 : sectorsize;
+ fs_info = btrfs_alloc_dummy_fs_info(len, len);
+
bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
test_msg("Couldn't allocate test bitmap\n");
return -ENOMEM;
}
- eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+ eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
if (!eb) {
test_msg("Couldn't allocate test extent buffer\n");
kfree(bitmap);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 3221c8dee272..eca6412d42bd 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -843,33 +843,31 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info)
+ return -ENOMEM;
+
/*
* For ppc64 (with 64k page size), bytes per bitmap might be
* larger than 1G. To make bitmap test available in ppc64,
* alloc dummy block group whose size cross bitmaps.
*/
- cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize
- + PAGE_SIZE, sectorsize);
+ cache = btrfs_alloc_dummy_block_group(fs_info,
+ BITS_PER_BITMAP * sectorsize + PAGE_SIZE);
if (!cache) {
test_msg("Couldn't run the tests\n");
+ btrfs_free_dummy_fs_info(fs_info);
return 0;
}
- fs_info = btrfs_alloc_dummy_fs_info();
- if (!fs_info) {
- ret = -ENOMEM;
- goto out;
- }
-
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
}
root->fs_info->extent_root = root;
- cache->fs_info = root->fs_info;
ret = test_extents(cache);
if (ret)
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 6e144048a72e..b29954c01673 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -455,14 +455,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
struct btrfs_path *path = NULL;
int ret;
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
ret = -ENOMEM;
goto out;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Couldn't allocate dummy root\n");
ret = PTR_ERR(root);
@@ -474,8 +474,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->fs_info->free_space_root = root;
root->fs_info->tree_root = root;
- root->node = alloc_test_extent_buffer(root->fs_info,
- nodesize, nodesize);
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -485,7 +484,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
btrfs_set_header_nritems(root->node, 0);
root->alloc_bytenr += 2 * nodesize;
- cache = btrfs_alloc_dummy_block_group(8 * alignment, sectorsize);
+ cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment);
if (!cache) {
test_msg("Couldn't allocate dummy block group cache\n");
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 0bf46808ce8f..4d0f038e14f1 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -249,19 +249,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
+ root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -854,19 +854,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
}
- root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize);
+ root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -950,13 +950,13 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
return ret;
}
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ca7cb5e6d385..0f4ce970d195 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -458,13 +458,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
struct btrfs_root *tmp_root;
int ret = 0;
- fs_info = btrfs_alloc_dummy_fs_info();
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
return -ENOMEM;
}
- root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_msg("Couldn't allocate root\n");
ret = PTR_ERR(root);
@@ -486,8 +486,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, nodesize,
- nodesize);
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -497,7 +496,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
btrfs_set_header_nritems(root->node, 0);
root->alloc_bytenr += 2 * nodesize;
- tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
@@ -512,7 +511,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
goto out;
}
- tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9517de0e668c..0e0508f488b2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -184,10 +184,10 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
/*
* either allocate a new transaction or hop into the existing one
*/
-static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
+static noinline int join_transaction(struct btrfs_fs_info *fs_info,
+ unsigned int type)
{
struct btrfs_transaction *cur_trans;
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
loop:
@@ -314,9 +314,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int force)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
root->last_trans < trans->transid) || force) {
- WARN_ON(root == root->fs_info->extent_root);
+ WARN_ON(root == fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
/*
@@ -331,15 +333,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
- spin_lock(&root->fs_info->fs_roots_radix_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (root->last_trans == trans->transid && !force) {
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
- radix_tree_tag_set(&root->fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ radix_tree_tag_set(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@@ -372,6 +374,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
/* Add ourselves to the transaction dropped list */
@@ -380,16 +383,18 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
- spin_lock(&root->fs_info->fs_roots_radix_lock);
- radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&root->fs_info->fs_roots_radix_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
@@ -402,9 +407,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
- mutex_lock(&root->fs_info->reloc_mutex);
+ mutex_lock(&fs_info->reloc_mutex);
record_root_in_trans(trans, root, 0);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
return 0;
}
@@ -420,35 +425,35 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
*/
-static void wait_current_trans(struct btrfs_root *root)
+static void wait_current_trans(struct btrfs_fs_info *fs_info)
{
struct btrfs_transaction *cur_trans;
- spin_lock(&root->fs_info->trans_lock);
- cur_trans = root->fs_info->running_transaction;
+ spin_lock(&fs_info->trans_lock);
+ cur_trans = fs_info->running_transaction;
if (cur_trans && is_transaction_blocked(cur_trans)) {
atomic_inc(&cur_trans->use_count);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
- wait_event(root->fs_info->transaction_wait,
+ wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
cur_trans->aborted);
btrfs_put_transaction(cur_trans);
} else {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
}
}
-static int may_wait_transaction(struct btrfs_root *root, int type)
+static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return 0;
if (type == TRANS_USERSPACE)
return 1;
if (type == TRANS_START &&
- !atomic_read(&root->fs_info->open_ioctl_trans))
+ !atomic_read(&fs_info->open_ioctl_trans))
return 1;
return 0;
@@ -456,7 +461,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
- if (!root->fs_info->reloc_ctl ||
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!fs_info->reloc_ctl ||
!test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
@@ -469,6 +476,8 @@ static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
@@ -479,7 +488,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
/* Send isn't supposed to start transactions. */
ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
- if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -496,23 +505,22 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* Do the reservation before we join the transaction so we can do all
* the appropriate flushing if need be.
*/
- if (num_items > 0 && root != root->fs_info->chunk_root) {
- qgroup_reserved = num_items * root->nodesize;
+ if (num_items > 0 && root != fs_info->chunk_root) {
+ qgroup_reserved = num_items * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
if (ret)
return ERR_PTR(ret);
- num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
*/
if (need_reserve_reloc_root(root)) {
- num_bytes += root->nodesize;
+ num_bytes += fs_info->nodesize;
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root,
- &root->fs_info->trans_block_rsv,
+ ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
num_bytes, flush);
if (ret)
goto reserve_fail;
@@ -535,15 +543,15 @@ again:
* transaction and commit it, so we needn't do sb_start_intwrite().
*/
if (type & __TRANS_FREEZABLE)
- sb_start_intwrite(root->fs_info->sb);
+ sb_start_intwrite(fs_info->sb);
- if (may_wait_transaction(root, type))
- wait_current_trans(root);
+ if (may_wait_transaction(fs_info, type))
+ wait_current_trans(fs_info);
do {
- ret = join_transaction(root, type);
+ ret = join_transaction(fs_info, type);
if (ret == -EBUSY) {
- wait_current_trans(root);
+ wait_current_trans(fs_info);
if (unlikely(type == TRANS_ATTACH))
ret = -ENOENT;
}
@@ -552,7 +560,7 @@ again:
if (ret < 0)
goto join_fail;
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
h->transid = cur_trans->transid;
h->transaction = cur_trans;
@@ -567,16 +575,16 @@ again:
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
- may_wait_transaction(root, type)) {
+ may_wait_transaction(fs_info, type)) {
current->journal_info = h;
- btrfs_commit_transaction(h, root);
+ btrfs_commit_transaction(h);
goto again;
}
if (num_bytes) {
- trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &root->fs_info->trans_block_rsv;
+ h->block_rsv = &fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
@@ -590,11 +598,11 @@ got_it:
join_fail:
if (type & __TRANS_FREEZABLE)
- sb_end_intwrite(root->fs_info->sb);
+ sb_end_intwrite(fs_info->sb);
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+ btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
btrfs_qgroup_free_meta(root, qgroup_reserved);
@@ -612,6 +620,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
unsigned int num_items,
int min_factor)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 num_bytes;
int ret;
@@ -624,19 +633,17 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
if (IS_ERR(trans))
return trans;
- num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes,
- min_factor);
+ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
+ ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
+ num_bytes, min_factor);
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ERR_PTR(ret);
}
- trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->block_rsv = &fs_info->trans_block_rsv;
trans->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, num_bytes, 1);
return trans;
@@ -702,30 +709,29 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
trans = start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH);
if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
- btrfs_wait_for_commit(root, 0);
+ btrfs_wait_for_commit(root->fs_info, 0);
return trans;
}
/* wait for a transaction commit to be fully complete */
-static noinline void wait_for_commit(struct btrfs_root *root,
- struct btrfs_transaction *commit)
+static noinline void wait_for_commit(struct btrfs_transaction *commit)
{
wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
}
-int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
+int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
struct btrfs_transaction *cur_trans = NULL, *t;
int ret = 0;
if (transid) {
- if (transid <= root->fs_info->last_trans_committed)
+ if (transid <= fs_info->last_trans_committed)
goto out;
/* find specified transaction */
- spin_lock(&root->fs_info->trans_lock);
- list_for_each_entry(t, &root->fs_info->trans_list, list) {
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry(t, &fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
atomic_inc(&cur_trans->use_count);
@@ -737,21 +743,21 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
break;
}
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
/*
* The specified transaction doesn't exist, or we
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > root->fs_info->last_trans_committed)
+ if (transid > fs_info->last_trans_committed)
ret = -EINVAL;
goto out;
}
} else {
/* find newest transaction that is committing | committed */
- spin_lock(&root->fs_info->trans_lock);
- list_for_each_entry_reverse(t, &root->fs_info->trans_list,
+ spin_lock(&fs_info->trans_lock);
+ list_for_each_entry_reverse(t, &fs_info->trans_list,
list) {
if (t->state >= TRANS_STATE_COMMIT_START) {
if (t->state == TRANS_STATE_COMPLETED)
@@ -761,37 +767,38 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
break;
}
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
if (!cur_trans)
goto out; /* nothing committing|committed */
}
- wait_for_commit(root, cur_trans);
+ wait_for_commit(cur_trans);
btrfs_put_transaction(cur_trans);
out:
return ret;
}
-void btrfs_throttle(struct btrfs_root *root)
+void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
- if (!atomic_read(&root->fs_info->open_ioctl_trans))
- wait_current_trans(root);
+ if (!atomic_read(&fs_info->open_ioctl_trans))
+ wait_current_trans(fs_info);
}
-static int should_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int should_end_transaction(struct btrfs_trans_handle *trans)
{
- if (root->fs_info->global_block_rsv.space_info->full &&
- btrfs_check_space_for_delayed_refs(trans, root))
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (fs_info->global_block_rsv.space_info->full &&
+ btrfs_check_space_for_delayed_refs(trans, fs_info))
return 1;
- return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
+ return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int updates;
int err;
@@ -803,19 +810,19 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates) {
- err = btrfs_run_delayed_refs(trans, root, updates * 2);
+ err = btrfs_run_delayed_refs(trans, fs_info, updates * 2);
if (err) /* Error code will also eval true */
return err;
}
- return should_end_transaction(trans, root);
+ return should_end_transaction(trans);
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int throttle)
+ int throttle)
{
+ struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_fs_info *info = root->fs_info;
u64 transid = trans->transid;
unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
@@ -828,16 +835,16 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_metadata(trans, info);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
+ btrfs_create_pending_block_groups(trans, info);
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans, root);
+ btrfs_should_throttle_delayed_refs(trans, info);
cur = max_t(unsigned long, cur, 32);
/*
@@ -849,16 +856,16 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_metadata(trans, info);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
+ btrfs_create_pending_block_groups(trans, info);
btrfs_trans_release_chunk_metadata(trans);
- if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
- should_end_transaction(trans, root) &&
+ if (lock && !atomic_read(&info->open_ioctl_trans) &&
+ should_end_transaction(trans) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
spin_lock(&info->trans_lock);
if (cur_trans->state == TRANS_STATE_RUNNING)
@@ -868,13 +875,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
if (throttle)
- return btrfs_commit_transaction(trans, root);
+ return btrfs_commit_transaction(trans);
else
wake_up_process(info->transaction_kthread);
}
if (trans->type & __TRANS_FREEZABLE)
- sb_end_intwrite(root->fs_info->sb);
+ sb_end_intwrite(info->sb);
WARN_ON(cur_trans != info->running_transaction);
WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
@@ -893,10 +900,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
current->journal_info = NULL;
if (throttle)
- btrfs_run_delayed_iputs(root);
+ btrfs_run_delayed_iputs(info);
if (trans->aborted ||
- test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
err = -EIO;
}
@@ -904,22 +911,20 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (must_run_delayed_refs) {
- btrfs_async_run_delayed_refs(root, cur, transid,
+ btrfs_async_run_delayed_refs(info, cur, transid,
must_run_delayed_refs == 1);
}
return err;
}
-int btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
- return __btrfs_end_transaction(trans, root, 0);
+ return __btrfs_end_transaction(trans, 0);
}
-int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
- return __btrfs_end_transaction(trans, root, 1);
+ return __btrfs_end_transaction(trans, 1);
}
/*
@@ -927,12 +932,12 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
* them in one of two extent_io trees. This is used to make sure all of
* those extents are sent to disk but does not wait on them
*/
-int btrfs_write_marked_extents(struct btrfs_root *root,
+int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
int err = 0;
int werr = 0;
- struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
@@ -949,11 +954,11 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
* time a temporary error. So when it happens, ignore the error
* and wait for writeback of this range to finish - because we
* failed to set the bit EXTENT_NEED_WAIT for the range, a call
- * to btrfs_wait_marked_extents() would not know that writeback
- * for this range started and therefore wouldn't wait for it to
- * finish - we don't want to commit a superblock that points to
- * btree nodes/leafs for which writeback hasn't finished yet
- * (and without errors).
+ * to __btrfs_wait_marked_extents() would not know that
+ * writeback for this range started and therefore wouldn't
+ * wait for it to finish - we don't want to commit a
+ * superblock that points to btree nodes/leafs for which
+ * writeback hasn't finished yet (and without errors).
* We cleanup any entries left in the io tree when committing
* the transaction (through clear_btree_io_tree()).
*/
@@ -981,16 +986,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
* those extents are on disk for transaction or log commit. We wait
* on all the pages and clear them from the dirty pages state tree
*/
-int btrfs_wait_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages, int mark)
+static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages)
{
int err = 0;
int werr = 0;
- struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
- bool errors = false;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
@@ -1018,27 +1022,45 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
}
if (err)
werr = err;
+ return werr;
+}
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
- if ((mark & EXTENT_DIRTY) &&
- test_and_clear_bit(BTRFS_FS_LOG1_ERR,
- &root->fs_info->flags))
- errors = true;
+int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages)
+{
+ bool errors = false;
+ int err;
- if ((mark & EXTENT_NEW) &&
- test_and_clear_bit(BTRFS_FS_LOG2_ERR,
- &root->fs_info->flags))
- errors = true;
- } else {
- if (test_and_clear_bit(BTRFS_FS_BTREE_ERR,
- &root->fs_info->flags))
- errors = true;
- }
+ err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
+ errors = true;
- if (errors && !werr)
- werr = -EIO;
+ if (errors && !err)
+ err = -EIO;
+ return err;
+}
- return werr;
+int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
+{
+ struct btrfs_fs_info *fs_info = log_root->fs_info;
+ struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
+ bool errors = false;
+ int err;
+
+ ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
+ err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY) &&
+ test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
+ errors = true;
+
+ if ((mark & EXTENT_NEW) &&
+ test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
+ errors = true;
+
+ if (errors && !err)
+ err = -EIO;
+ return err;
}
/*
@@ -1046,7 +1068,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
* them in one of two extent_io trees. This is used to make sure all of
* those extents are on disk for transaction or log commit
*/
-static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
int ret;
@@ -1054,9 +1076,9 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct blk_plug plug;
blk_start_plug(&plug);
- ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+ ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark);
blk_finish_plug(&plug);
- ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
+ ret2 = btrfs_wait_extents(fs_info, dirty_pages);
if (ret)
return ret;
@@ -1066,11 +1088,11 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
}
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
int ret;
- ret = btrfs_write_and_wait_marked_extents(root,
+ ret = btrfs_write_and_wait_marked_extents(fs_info,
&trans->transaction->dirty_pages,
EXTENT_DIRTY);
clear_btree_io_tree(&trans->transaction->dirty_pages);
@@ -1094,7 +1116,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
int ret;
u64 old_root_bytenr;
u64 old_root_used;
- struct btrfs_root *tree_root = root->fs_info->tree_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
old_root_used = btrfs_root_used(&root->root_item);
@@ -1125,9 +1148,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
* to clean up the delayed refs.
*/
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
@@ -1143,30 +1165,31 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret)
return ret;
- ret = btrfs_run_dev_stats(trans, root->fs_info);
+ ret = btrfs_run_dev_stats(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_run_dev_replace(trans, root->fs_info);
+ ret = btrfs_run_dev_replace(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_run_qgroups(trans, root->fs_info);
+ ret = btrfs_run_qgroups(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_setup_space_cache(trans, root);
+ ret = btrfs_setup_space_cache(trans, fs_info);
if (ret)
return ret;
/* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret)
return ret;
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
+ struct btrfs_root *root;
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
@@ -1178,16 +1201,16 @@ again:
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret)
return ret;
}
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
- ret = btrfs_write_dirty_block_groups(trans, root);
+ ret = btrfs_write_dirty_block_groups(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1209,20 +1232,21 @@ again:
*/
void btrfs_add_dead_root(struct btrfs_root *root)
{
- spin_lock(&root->fs_info->trans_lock);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list))
- list_add_tail(&root->root_list, &root->fs_info->dead_roots);
- spin_unlock(&root->fs_info->trans_lock);
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
+ spin_unlock(&fs_info->trans_lock);
}
/*
* update all the fs tree roots on disk
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
struct btrfs_root *gang[8];
- struct btrfs_fs_info *fs_info = root->fs_info;
int i;
int ret;
int err = 0;
@@ -1236,7 +1260,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
if (ret == 0)
break;
for (i = 0; i < ret; i++) {
- root = gang[i];
+ struct btrfs_root *root = gang[i];
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1292,8 +1316,8 @@ int btrfs_defrag_root(struct btrfs_root *root)
ret = btrfs_defrag_leaves(trans, root);
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(info->tree_root);
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(info);
cond_resched();
if (btrfs_fs_closing(info) || ret != -EAGAIN)
@@ -1343,7 +1367,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
*/
mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, src);
+ ret = commit_fs_roots(trans, fs_info);
if (ret)
goto out;
ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
@@ -1372,11 +1396,11 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* like chunk and root tree, as they won't affect qgroup.
* And we don't write super to avoid half committed status.
*/
- ret = commit_cowonly_roots(trans, src);
+ ret = commit_cowonly_roots(trans, fs_info);
if (ret)
goto out;
switch_commit_roots(trans->transaction, fs_info);
- ret = btrfs_write_and_wait_transaction(trans, src);
+ ret = btrfs_write_and_wait_transaction(trans, fs_info);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction for qgroup");
@@ -1462,7 +1486,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
- trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
dentry = pending->dentry;
@@ -1499,7 +1523,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* otherwise we corrupt the FS during
* snapshot
*/
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
if (ret) { /* Transaction aborted */
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1572,7 +1596,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/*
* insert root back/forward references
*/
- ret = btrfs_add_root_ref(trans, tree_root, objectid,
+ ret = btrfs_add_root_ref(trans, fs_info, objectid,
parent_root->root_key.objectid,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
@@ -1582,7 +1606,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
@@ -1595,7 +1619,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1632,14 +1656,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+ ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ ret = btrfs_uuid_tree_add(trans, fs_info,
new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
@@ -1649,7 +1673,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1690,25 +1714,25 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
return ret;
}
-static void update_super_roots(struct btrfs_root *root)
+static void update_super_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
- super = root->fs_info->super_copy;
+ super = fs_info->super_copy;
- root_item = &root->fs_info->chunk_root->root_item;
+ root_item = &fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
super->chunk_root_generation = root_item->generation;
super->chunk_root_level = root_item->level;
- root_item = &root->fs_info->tree_root->root_item;
+ root_item = &fs_info->tree_root->root_item;
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
- if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &root->fs_info->flags))
+ if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
}
@@ -1742,24 +1766,23 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
* wait for the current transaction commit to start and block subsequent
* transaction joins
*/
-static void wait_current_trans_commit_start(struct btrfs_root *root,
+static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *trans)
{
- wait_event(root->fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START ||
- trans->aborted);
+ wait_event(fs_info->transaction_blocked_wait,
+ trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
}
/*
* wait for the current transaction to start and then become unblocked.
* caller holds ref.
*/
-static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
- struct btrfs_transaction *trans)
+static void wait_current_trans_commit_start_and_unblock(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_transaction *trans)
{
- wait_event(root->fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED ||
- trans->aborted);
+ wait_event(fs_info->transaction_wait,
+ trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
}
/*
@@ -1768,7 +1791,6 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
*/
struct btrfs_async_commit {
struct btrfs_trans_handle *newtrans;
- struct btrfs_root *root;
struct work_struct work;
};
@@ -1782,18 +1804,18 @@ static void do_async_commit(struct work_struct *work)
* Tell lockdep about it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
+ __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
current->journal_info = ac->newtrans;
- btrfs_commit_transaction(ac->newtrans, ac->root);
+ btrfs_commit_transaction(ac->newtrans);
kfree(ac);
}
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int wait_for_unblock)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_async_commit *ac;
struct btrfs_transaction *cur_trans;
@@ -1802,8 +1824,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
return -ENOMEM;
INIT_WORK(&ac->work, do_async_commit);
- ac->root = root;
- ac->newtrans = btrfs_join_transaction(root);
+ ac->newtrans = btrfs_join_transaction(trans->root);
if (IS_ERR(ac->newtrans)) {
int err = PTR_ERR(ac->newtrans);
kfree(ac);
@@ -1814,22 +1835,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
cur_trans = trans->transaction;
atomic_inc(&cur_trans->use_count);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
/*
* Tell lockdep we've released the freeze rwsem, since the
* async commit thread will be the one to unlock it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
+ __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
/* wait for transaction to start and unblock */
if (wait_for_unblock)
- wait_current_trans_commit_start_and_unblock(root, cur_trans);
+ wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
else
- wait_current_trans_commit_start(root, cur_trans);
+ wait_current_trans_commit_start(fs_info, cur_trans);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1842,6 +1863,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
static void cleanup_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int err)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
DEFINE_WAIT(wait);
@@ -1849,7 +1871,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, err);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
/*
* If the transaction is removed from the list, it means this
@@ -1859,25 +1881,25 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
BUG_ON(list_empty(&cur_trans->list));
list_del_init(&cur_trans->list);
- if (cur_trans == root->fs_info->running_transaction) {
+ if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
}
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
- btrfs_cleanup_one_transaction(trans->transaction, root);
+ btrfs_cleanup_one_transaction(trans->transaction, fs_info);
- spin_lock(&root->fs_info->trans_lock);
- if (cur_trans == root->fs_info->running_transaction)
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
+ if (cur_trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ spin_unlock(&fs_info->trans_lock);
if (trans->type & __TRANS_FREEZABLE)
- sb_end_intwrite(root->fs_info->sb);
+ sb_end_intwrite(fs_info->sb);
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
@@ -1885,7 +1907,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
if (current->journal_info == trans)
current->journal_info = NULL;
- btrfs_scrub_cancel(root->fs_info);
+ btrfs_scrub_cancel(fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
@@ -1910,9 +1932,9 @@ btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
atomic_read(&cur_trans->pending_ordered) == 0);
}
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
@@ -1920,20 +1942,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
/* Stop the commit early if ->aborted is set */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
/* make a pass through all the delayed refs we have so far;
* any running procs may add more while we are here
*/
- ret = btrfs_run_delayed_refs(trans, root, 0);
+ ret = btrfs_run_delayed_refs(trans, fs_info, 0);
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
- btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_metadata(trans, fs_info);
trans->block_rsv = NULL;
cur_trans = trans->transaction;
@@ -1946,11 +1968,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
smp_wmb();
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
+ btrfs_create_pending_block_groups(trans, fs_info);
- ret = btrfs_run_delayed_refs(trans, root, 0);
+ ret = btrfs_run_delayed_refs(trans, fs_info, 0);
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -1970,27 +1992,27 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
- mutex_lock(&root->fs_info->ro_block_group_mutex);
+ mutex_lock(&fs_info->ro_block_group_mutex);
if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
&cur_trans->flags))
run_it = 1;
- mutex_unlock(&root->fs_info->ro_block_group_mutex);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
if (run_it)
- ret = btrfs_start_dirty_block_groups(trans, root);
+ ret = btrfs_start_dirty_block_groups(trans, fs_info);
}
if (ret) {
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_end_transaction(trans);
- wait_for_commit(root, cur_trans);
+ wait_for_commit(cur_trans);
if (unlikely(cur_trans->aborted))
ret = cur_trans->aborted;
@@ -2001,35 +2023,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
cur_trans->state = TRANS_STATE_COMMIT_START;
- wake_up(&root->fs_info->transaction_blocked_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
- if (cur_trans->list.prev != &root->fs_info->trans_list) {
+ if (cur_trans->list.prev != &fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (prev_trans->state != TRANS_STATE_COMPLETED) {
atomic_inc(&prev_trans->use_count);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
- wait_for_commit(root, prev_trans);
+ wait_for_commit(prev_trans);
ret = prev_trans->aborted;
btrfs_put_transaction(prev_trans);
if (ret)
goto cleanup_transaction;
} else {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
}
} else {
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
}
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(root->fs_info);
+ ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
goto cleanup_transaction;
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
if (ret)
goto cleanup_transaction;
@@ -2037,23 +2059,23 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_read(cur_trans) == 0);
/* some pending items might have been added after the previous flush. */
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(root->fs_info);
+ btrfs_wait_delalloc_flush(fs_info);
btrfs_wait_pending_ordered(cur_trans);
- btrfs_scrub_pause(root);
+ btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -2067,16 +2089,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* the balancing code from coming in and moving
* extents around in the middle of the commit
*/
- mutex_lock(&root->fs_info->reloc_mutex);
+ mutex_lock(&fs_info->reloc_mutex);
/*
* We needn't worry about the delayed items because we will
* deal with them in create_pending_snapshot(), which is the
* core function of the snapshot creation.
*/
- ret = create_pending_snapshots(trans, root->fs_info);
+ ret = create_pending_snapshots(trans, fs_info);
if (ret) {
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
@@ -2090,22 +2112,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* because all the trees which are snapshotted will be forced to COW
* the nodes and leaves.
*/
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
if (ret) {
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
if (ret) {
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
/* Record old roots for later qgroup accounting */
- ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
if (ret) {
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
@@ -2113,7 +2135,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* make sure none of the code above managed to slip in a
* delayed item
*/
- btrfs_assert_delayed_root_empty(root);
+ btrfs_assert_delayed_root_empty(fs_info);
WARN_ON(cur_trans != trans->transaction);
@@ -2130,12 +2152,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* from now until after the super is written, we avoid races
* with the tree-log code.
*/
- mutex_lock(&root->fs_info->tree_log_mutex);
+ mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, root);
+ ret = commit_fs_roots(trans, fs_info);
if (ret) {
- mutex_unlock(&root->fs_info->tree_log_mutex);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
@@ -2143,28 +2165,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* Since the transaction is done, we can apply the pending changes
* before the next transaction.
*/
- btrfs_apply_pending_changes(root->fs_info);
+ btrfs_apply_pending_changes(fs_info);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
- btrfs_free_log_root_tree(trans, root->fs_info);
+ btrfs_free_log_root_tree(trans, fs_info);
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
- ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
if (ret < 0) {
- mutex_unlock(&root->fs_info->tree_log_mutex);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = commit_cowonly_roots(trans, root);
+ ret = commit_cowonly_roots(trans, fs_info);
if (ret) {
- mutex_unlock(&root->fs_info->tree_log_mutex);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
@@ -2174,64 +2196,64 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- mutex_unlock(&root->fs_info->tree_log_mutex);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- btrfs_prepare_extent_commit(trans, root);
+ btrfs_prepare_extent_commit(trans, fs_info);
- cur_trans = root->fs_info->running_transaction;
+ cur_trans = fs_info->running_transaction;
- btrfs_set_root_node(&root->fs_info->tree_root->root_item,
- root->fs_info->tree_root->node);
- list_add_tail(&root->fs_info->tree_root->dirty_list,
+ btrfs_set_root_node(&fs_info->tree_root->root_item,
+ fs_info->tree_root->node);
+ list_add_tail(&fs_info->tree_root->dirty_list,
&cur_trans->switch_commits);
- btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
- root->fs_info->chunk_root->node);
- list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ btrfs_set_root_node(&fs_info->chunk_root->root_item,
+ fs_info->chunk_root->node);
+ list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
- switch_commit_roots(cur_trans, root->fs_info);
+ switch_commit_roots(cur_trans, fs_info);
assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
- update_super_roots(root);
+ update_super_roots(fs_info);
- btrfs_set_super_log_root(root->fs_info->super_copy, 0);
- btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
- memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
- sizeof(*root->fs_info->super_copy));
+ btrfs_set_super_log_root(fs_info->super_copy, 0);
+ btrfs_set_super_log_root_level(fs_info->super_copy, 0);
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_copy));
- btrfs_update_commit_device_size(root->fs_info);
- btrfs_update_commit_device_bytes_used(root, cur_trans);
+ btrfs_update_commit_device_size(fs_info);
+ btrfs_update_commit_device_bytes_used(fs_info, cur_trans);
- clear_bit(BTRFS_FS_LOG1_ERR, &root->fs_info->flags);
- clear_bit(BTRFS_FS_LOG2_ERR, &root->fs_info->flags);
+ clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
+ clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
btrfs_trans_release_chunk_metadata(trans);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
- mutex_unlock(&root->fs_info->reloc_mutex);
+ fs_info->running_transaction = NULL;
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->reloc_mutex);
- wake_up(&root->fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_wait);
- ret = btrfs_write_and_wait_transaction(trans, root);
+ ret = btrfs_write_and_wait_transaction(trans, fs_info);
if (ret) {
- btrfs_handle_fs_error(root->fs_info, ret,
- "Error while writing out transaction");
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ btrfs_handle_fs_error(fs_info, ret,
+ "Error while writing out transaction");
+ mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
- ret = write_ctree_super(trans, root, 0);
+ ret = write_ctree_super(trans, fs_info, 0);
if (ret) {
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2239,14 +2261,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* the super is written, we can safely allow the tree-loggers
* to go about their business
*/
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
- btrfs_finish_extent_commit(trans, root);
+ btrfs_finish_extent_commit(trans, fs_info);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
- btrfs_clear_space_info_full(root->fs_info);
+ btrfs_clear_space_info_full(fs_info);
- root->fs_info->last_trans_committed = cur_trans->transid;
+ fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2254,19 +2276,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- spin_lock(&root->fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
- spin_unlock(&root->fs_info->trans_lock);
+ spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
if (trans->type & __TRANS_FREEZABLE)
- sb_end_intwrite(root->fs_info->sb);
+ sb_end_intwrite(fs_info->sb);
- trace_btrfs_transaction_commit(root);
+ trace_btrfs_transaction_commit(trans->root);
- btrfs_scrub_continue(root);
+ btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2277,23 +2299,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* If fs has been frozen, we can not handle delayed iputs, otherwise
* it'll result in deadlock about SB_FREEZE_FS.
*/
- if (current != root->fs_info->transaction_kthread &&
- current != root->fs_info->cleaner_kthread &&
- !root->fs_info->fs_frozen)
- btrfs_run_delayed_iputs(root);
+ if (current != fs_info->transaction_kthread &&
+ current != fs_info->cleaner_kthread && !fs_info->fs_frozen)
+ btrfs_run_delayed_iputs(fs_info);
return ret;
scrub_continue:
- btrfs_scrub_continue(root);
+ btrfs_scrub_continue(fs_info);
cleanup_transaction:
- btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_metadata(trans, fs_info);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
+ btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
- cleanup_transaction(trans, root, ret);
+ cleanup_transaction(trans, trans->root, ret);
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6cf0d37d4f76..5dfb5590fff6 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -123,11 +123,6 @@ struct btrfs_trans_handle {
bool sync;
bool dirty;
unsigned int type;
- /*
- * this root is only needed to validate that the root passed to
- * start_transaction is the same as the one passed to end_transaction.
- * Subvolume quota depends on this
- */
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct seq_list delayed_ref_elem;
@@ -185,8 +180,7 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-int btrfs_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+int btrfs_end_transaction(struct btrfs_trans_handle *trans);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items);
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
@@ -202,27 +196,24 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
-int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
+int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int wait_for_unblock);
-int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
-void btrfs_throttle(struct btrfs_root *root);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+void btrfs_throttle(struct btrfs_fs_info *fs_info);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_write_marked_extents(struct btrfs_root *root,
- struct extent_io_tree *dirty_pages, int mark);
-int btrfs_wait_marked_extents(struct btrfs_root *root,
+int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
+int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *dirty_pages);
+int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3d33c4e41e5f..eeffff84f280 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -37,6 +37,7 @@
*/
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
+#define LOG_OTHER_INODE 2
/*
* directory trouble cases
@@ -142,12 +143,13 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
- if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ if (btrfs_need_log_full_commit(fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
@@ -159,10 +161,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
- mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, root->fs_info);
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_lock(&fs_info->tree_log_mutex);
+ if (!fs_info->log_root_tree)
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ mutex_unlock(&fs_info->tree_log_mutex);
if (ret)
goto out;
@@ -292,25 +294,26 @@ static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
struct walk_control *wc, u64 gen)
{
+ struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
/*
* If this fs is mixed then we need to be able to process the leaves to
* pin down any logged extents, so we have to read the block.
*/
- if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
ret = btrfs_read_buffer(eb, gen);
if (ret)
return ret;
}
if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
- eb->start, eb->len);
+ ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
+ eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
if (wc->pin && btrfs_header_level(eb) == 0)
- ret = btrfs_exclude_logged_extents(log, eb);
+ ret = btrfs_exclude_logged_extents(fs_info, eb);
if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
@@ -339,6 +342,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
u32 item_size;
u64 saved_i_size = 0;
@@ -459,9 +463,9 @@ insert:
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(root, path, item_size, 1);
+ btrfs_truncate_item(fs_info, path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(root, path,
+ btrfs_extend_item(fs_info, path,
item_size - found_size);
} else if (ret) {
return ret;
@@ -582,6 +586,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
u64 start = key->offset;
@@ -608,7 +613,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_inline_len(eb, slot, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
- extent_end = ALIGN(start + size, root->sectorsize);
+ extent_end = ALIGN(start + size,
+ fs_info->sectorsize);
} else {
ret = 0;
goto out;
@@ -689,7 +695,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* as the owner of the file extent changed from log tree
* (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
- ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+ ret = btrfs_qgroup_trace_extent(trans, fs_info,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item),
GFP_NOFS);
@@ -704,10 +710,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
*/
- ret = btrfs_lookup_data_extent(root, ins.objectid,
+ ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret == 0) {
- ret = btrfs_inc_extent_ref(trans, root,
+ ret = btrfs_inc_extent_ref(trans, fs_info,
ins.objectid, ins.offset,
0, root->root_key.objectid,
key->objectid, offset);
@@ -719,7 +725,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
- root, root->root_key.objectid,
+ fs_info,
+ root->root_key.objectid,
key->objectid, offset, &ins);
if (ret)
goto out;
@@ -796,14 +803,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_del_csums(trans,
- root->fs_info->csum_root,
- sums->bytenr,
- sums->len);
+ ret = btrfs_del_csums(trans, fs_info,
+ sums->bytenr,
+ sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
- root->fs_info->csum_root,
- sums);
+ fs_info->csum_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -841,6 +846,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
char *name;
int name_len;
@@ -873,7 +879,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
else
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
out:
kfree(name);
iput(inode);
@@ -991,6 +997,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, char *name, int namelen,
int *search_done)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *victim_name;
int victim_name_len;
@@ -1049,7 +1056,7 @@ again:
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
if (ret)
return ret;
*search_done = 1;
@@ -1120,7 +1127,8 @@ again:
victim_name_len);
if (!ret)
ret = btrfs_run_delayed_items(
- trans, root);
+ trans,
+ fs_info);
}
iput(victim_parent);
kfree(victim_name);
@@ -1811,6 +1819,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
@@ -1823,7 +1832,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(root, eb, di))
+ if (verify_dir_item(fs_info, eb, di))
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
@@ -1940,12 +1949,11 @@ static noinline int find_dir_range(struct btrfs_root *root,
next:
/* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
+ path->slots[0]++;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret)
goto out;
- } else {
- path->slots[0]++;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -1978,6 +1986,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct extent_buffer *eb;
int slot;
@@ -1999,7 +2008,7 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
- if (verify_dir_item(root, eb, di)) {
+ if (verify_dir_item(fs_info, eb, di)) {
ret = -EIO;
goto out;
}
@@ -2046,7 +2055,7 @@ again:
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
if (!ret)
- ret = btrfs_run_delayed_items(trans, root);
+ ret = btrfs_run_delayed_items(trans, fs_info);
kfree(name);
iput(inode);
if (ret)
@@ -2407,6 +2416,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 root_owner;
u64 bytenr;
u64 ptr_gen;
@@ -2432,12 +2442,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = root->nodesize;
+ blocksize = fs_info->nodesize;
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
- next = btrfs_find_create_tree_block(root, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2459,16 +2469,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root->fs_info,
- next);
+ clean_tree_block(trans, fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(root,
- bytenr, blocksize);
+ ret = btrfs_free_and_pin_reserved_extent(
+ fs_info, bytenr,
+ blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2505,6 +2515,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 root_owner;
int i;
int slot;
@@ -2538,14 +2549,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root->fs_info,
- next);
+ clean_tree_block(trans, fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(root,
+ ret = btrfs_free_and_pin_reserved_extent(
+ fs_info,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
@@ -2567,6 +2578,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
static int walk_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct walk_control *wc)
{
+ struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
int wret;
int level;
@@ -2615,15 +2627,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, log->fs_info, next);
+ clean_tree_block(trans, fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(log, next->start,
- next->len);
+ ret = btrfs_free_and_pin_reserved_extent(fs_info,
+ next->start, next->len);
if (ret)
goto out;
}
@@ -2641,14 +2653,15 @@ out:
static int update_log_root(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
+ struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
if (log->log_transid == 1) {
/* insert root item on the first sync */
- ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+ ret = btrfs_insert_root(trans, fs_info->log_root_tree,
&log->root_key, &log->root_item);
} else {
- ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+ ret = btrfs_update_root(trans, fs_info->log_root_tree,
&log->root_key, &log->root_item);
}
return ret;
@@ -2742,8 +2755,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int index2;
int mark;
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
- struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ struct btrfs_root *log_root_tree = fs_info->log_root_tree;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -2771,7 +2785,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root->fs_info, SSD) &&
+ if (!btrfs_test_opt(fs_info, SSD) &&
test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
@@ -2783,7 +2797,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ if (btrfs_need_log_full_commit(fs_info, trans)) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2799,12 +2813,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* wait for them until later.
*/
blk_start_plug(&plug);
- ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2849,14 +2863,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_tree_log_extents(log, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
@@ -2874,8 +2888,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
- ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
- mark);
+ ret = btrfs_wait_tree_log_extents(log, mark);
btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
@@ -2898,43 +2911,42 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ if (btrfs_need_log_full_commit(fs_info, trans)) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ btrfs_wait_tree_log_extents(log, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
}
- ret = btrfs_write_marked_extents(log_root_tree,
+ ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
- ret = btrfs_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_NEW | EXTENT_DIRTY);
+ ret = btrfs_wait_tree_log_extents(log_root_tree,
+ EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
btrfs_wait_logged_extents(trans, log, log_transid);
- btrfs_set_super_log_root(root->fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
+ btrfs_set_super_log_root(fs_info->super_for_commit,
+ log_root_tree->node->start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit,
+ btrfs_header_level(log_root_tree->node));
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
@@ -2946,9 +2958,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
+ ret = write_ctree_super(trans, fs_info, 1);
if (ret) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
@@ -3182,6 +3194,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct inode *inode, u64 dirid)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log;
u64 index;
int ret;
@@ -3199,7 +3212,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, ret);
@@ -3606,6 +3619,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long src_offset;
unsigned long dst_offset;
struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
@@ -3716,7 +3730,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
}
ret = btrfs_lookup_csums_range(
- log->fs_info->csum_root,
+ fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
if (ret) {
@@ -3789,7 +3803,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_path->slots[0],
extent);
*last_extent = ALIGN(key.offset + len,
- log->sectorsize);
+ fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(src, extent);
*last_extent = key.offset + len;
@@ -3852,7 +3866,8 @@ fill_holes:
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_inline_len(src, i, extent);
- extent_end = ALIGN(key.offset + len, log->sectorsize);
+ extent_end = ALIGN(key.offset + len,
+ fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(src, extent);
extent_end = key.offset + len;
@@ -3902,6 +3917,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
const struct list_head *logged_list,
bool *ordered_io_error)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *log = root->log_root;
u64 mod_start = em->mod_start;
@@ -4018,7 +4034,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+ ret = btrfs_lookup_csums_range(fs_info->csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
@@ -4361,6 +4377,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct btrfs_key key;
u64 hole_start;
@@ -4370,7 +4387,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(inode);
- if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = ino;
@@ -4427,7 +4444,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
if (hole_size == 0)
return 0;
- hole_size = ALIGN(hole_size, root->sectorsize);
+ hole_size = ALIGN(hole_size, fs_info->sectorsize);
ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
hole_size, 0, hole_size, 0, 0, 0);
return ret;
@@ -4585,6 +4602,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
const loff_t end,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
@@ -4624,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode) ||
(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags) &&
- inode_only == LOG_INODE_EXISTS))
+ inode_only >= LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
@@ -4637,7 +4655,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
*/
if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
+ BTRFS_I(inode)->generation > fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
else
ret = btrfs_commit_inode_delayed_inode(inode);
@@ -4648,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
- mutex_lock(&BTRFS_I(inode)->log_mutex);
+ if (inode_only == LOG_OTHER_INODE) {
+ inode_only = LOG_INODE_EXISTS;
+ mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ }
/*
* a brute force approach to making sure we get the most uptodate
@@ -4774,7 +4798,7 @@ again:
inode_key.objectid = other_ino;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
- other_inode = btrfs_iget(root->fs_info->sb,
+ other_inode = btrfs_iget(fs_info->sb,
&inode_key, root,
NULL);
/*
@@ -4800,7 +4824,7 @@ again:
* unpin it.
*/
err = btrfs_log_inode(trans, root, other_inode,
- LOG_INODE_EXISTS,
+ LOG_OTHER_INODE,
0, LLONG_MAX, ctx);
iput(other_inode);
if (err)
@@ -5138,6 +5162,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct inode *start_inode,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_path *path;
LIST_HEAD(dir_list);
@@ -5205,8 +5230,8 @@ process_leaf:
if (di_key.type == BTRFS_ROOT_ITEM_KEY)
continue;
- di_inode = btrfs_iget(root->fs_info->sb, &di_key,
- root, NULL);
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto next_dir_inode;
@@ -5214,13 +5239,12 @@ process_leaf:
if (btrfs_inode_in_log(di_inode, trans->transid)) {
iput(di_inode);
- continue;
+ break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
- btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
log_mode, 0, LLONG_MAX, ctx);
if (!ret &&
@@ -5268,6 +5292,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -5332,7 +5357,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+ dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
/* If parent inode was deleted, skip it. */
if (IS_ERR(dir_inode))
@@ -5374,17 +5399,18 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
int exists_only,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
struct dentry *old_parent = NULL;
int ret = 0;
- u64 last_committed = root->fs_info->last_trans_committed;
+ u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
struct inode *orig_inode = inode;
sb = inode->i_sb;
- if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
+ if (btrfs_test_opt(fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
}
@@ -5393,8 +5419,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* The prev transaction commit doesn't complete, we need do
* full commit by ourselves.
*/
- if (root->fs_info->last_trans_log_full_commit >
- root->fs_info->last_trans_committed) {
+ if (fs_info->last_trans_log_full_commit >
+ fs_info->last_trans_committed) {
ret = 1;
goto end_no_trans;
}
@@ -5515,7 +5541,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_set_log_full_commit(fs_info, trans);
ret = 1;
}
@@ -5675,7 +5701,7 @@ again:
btrfs_free_path(path);
/* step 4: commit the transaction, which also unpins the blocks */
- ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
@@ -5687,7 +5713,7 @@ again:
return 0;
error:
if (wc.trans)
- btrfs_end_transaction(wc.trans, fs_info->tree_root);
+ btrfs_end_transaction(wc.trans);
btrfs_free_path(path);
return ret;
}
@@ -5786,6 +5812,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *old_dir,
struct dentry *parent)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root * root = BTRFS_I(inode)->root;
/*
@@ -5800,9 +5827,9 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* from hasn't been logged, we don't need to log it
*/
if (BTRFS_I(inode)->logged_trans <=
- root->fs_info->last_trans_committed &&
+ fs_info->last_trans_committed &&
(!old_dir || BTRFS_I(old_dir)->logged_trans <=
- root->fs_info->last_trans_committed))
+ fs_info->last_trans_committed))
return 0;
return btrfs_log_inode_parent(trans, root, inode, parent, 0,
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 7fc89e4adb41..726f928238d0 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -92,9 +92,10 @@ out:
}
int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
- struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
u64 subid_cpu)
{
+ struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -132,13 +133,13 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
* An item with that type already exists.
* Extend the item and store the new subid at the end.
*/
- btrfs_extend_item(uuid_root, path, sizeof(subid_le));
+ btrfs_extend_item(fs_info, path, sizeof(subid_le));
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
} else if (ret < 0) {
- btrfs_warn(uuid_root->fs_info,
+ btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
ret, (unsigned long long)key.objectid,
(unsigned long long)key.offset, type);
@@ -156,9 +157,10 @@ out:
}
int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
- struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+ struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
u64 subid)
{
+ struct btrfs_root *uuid_root = fs_info->uuid_root;
int ret;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -185,8 +187,8 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn(uuid_root->fs_info,
- "error %d while searching for uuid item!", ret);
+ btrfs_warn(fs_info, "error %d while searching for uuid item!",
+ ret);
goto out;
}
if (ret > 0) {
@@ -199,8 +201,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
offset = btrfs_item_ptr_offset(eb, slot);
item_size = btrfs_item_size_nr(eb, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- btrfs_warn(uuid_root->fs_info,
- "uuid item with illegal size %lu!",
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
ret = -ENOENT;
goto out;
@@ -230,7 +231,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
move_src = offset + sizeof(subid);
move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
memmove_extent_buffer(eb, move_dst, move_src, move_len);
- btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1);
+ btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1);
out:
btrfs_free_path(path);
@@ -250,8 +251,8 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
goto out;
}
- ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid);
- btrfs_end_transaction(trans, uuid_root);
+ ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid);
+ btrfs_end_transaction(trans);
out:
return ret;
@@ -351,7 +352,5 @@ skip:
out:
btrfs_free_path(path);
- if (ret)
- btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
- return 0;
+ return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 71a60cc01451..3c3c69c0eee4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -134,9 +134,9 @@ const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
};
static int init_first_rw_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_device *device);
-static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
@@ -343,9 +343,9 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios,
*/
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
struct bio *pending;
struct backing_dev_info *bdi;
- struct btrfs_fs_info *fs_info;
struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
@@ -367,7 +367,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
blk_start_plug(&plug);
bdi = blk_get_backing_dev_info(device->bdev);
- fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
@@ -1179,7 +1178,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length)
{
struct btrfs_key key;
- struct btrfs_root *root = device->dev_root;
+ struct btrfs_root *root = device->fs_info->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 extent_end;
@@ -1262,7 +1261,7 @@ static int contains_pending_extent(struct btrfs_transaction *transaction,
struct btrfs_device *device,
u64 *start, u64 len)
{
- struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_fs_info *fs_info = device->fs_info;
struct extent_map *em;
struct list_head *search_list = &fs_info->pinned_chunks;
int ret = 0;
@@ -1338,8 +1337,9 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
struct btrfs_device *device, u64 num_bytes,
u64 search_start, u64 *start, u64 *len)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
- struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size;
@@ -1357,7 +1357,7 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
* used by the boot loader (grub for example), so we make sure to start
* at an offset of at least 1MB.
*/
- min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ min_search_start = max(fs_info->alloc_start, 1024ull * 1024);
search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
@@ -1508,9 +1508,10 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
int ret;
struct btrfs_path *path;
- struct btrfs_root *root = device->dev_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
@@ -1544,7 +1545,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
+ btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
goto out;
}
@@ -1552,8 +1553,8 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_handle_fs_error(root->fs_info, ret,
- "Failed to remove dev extent item");
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to remove dev extent item");
} else {
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
@@ -1569,7 +1570,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_path *path;
- struct btrfs_root *root = device->dev_root;
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -1595,8 +1597,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
- write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
- btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
+ write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
@@ -1667,9 +1668,10 @@ error:
* the btrfs_device struct should be fully filled in
*/
static int btrfs_add_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_dev_item *dev_item;
@@ -1677,8 +1679,6 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_key key;
unsigned long ptr;
- root = root->fs_info->chunk_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1713,7 +1713,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
- write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
+ write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
@@ -1737,16 +1737,15 @@ static void update_dev_time(char *path_name)
filp_close(filp, NULL);
}
-static int btrfs_rm_dev_item(struct btrfs_root *root,
+static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_trans_handle *trans;
- root = root->fs_info->chunk_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1774,7 +1773,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
goto out;
out:
btrfs_free_path(path);
- btrfs_commit_transaction(trans, root);
+ btrfs_commit_transaction(trans);
return ret;
}
@@ -1853,7 +1852,7 @@ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
-int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info, char *device_path, u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -1863,20 +1862,20 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
mutex_lock(&uuid_mutex);
- num_devices = root->fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
- if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+ num_devices = fs_info->fs_devices->num_devices;
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
- ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
+ ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- ret = btrfs_find_device_by_devspec(root, devid, device_path,
- &device);
+ ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
+ &device);
if (ret)
goto out;
@@ -1885,16 +1884,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
goto out;
}
- if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+ if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
if (device->writeable) {
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
clear_super = true;
}
@@ -1909,12 +1908,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
- ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+ ret = btrfs_rm_dev_item(fs_info, device);
if (ret)
goto error_undo;
device->in_fs_metadata = 0;
- btrfs_scrub_cancel_dev(root->fs_info, device);
+ btrfs_scrub_cancel_dev(fs_info, device);
/*
* the device list mutex makes sure that we don't change
@@ -1927,7 +1926,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
*/
cur_devices = device->fs_devices;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
device->fs_devices->num_devices--;
@@ -1936,17 +1935,17 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
if (device->missing)
device->fs_devices->missing_devices--;
- btrfs_assign_next_active_device(root->fs_info, device, NULL);
+ btrfs_assign_next_active_device(fs_info, device, NULL);
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
}
- num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
- btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
+ btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/*
* at this point, the device is zero sized and detached from
@@ -1961,7 +1960,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
- fs_devices = root->fs_info->fs_devices;
+ fs_devices = fs_info->fs_devices;
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
@@ -1974,8 +1973,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
free_fs_devices(cur_devices);
}
- root->fs_info->num_tolerated_disk_barrier_failures =
- btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
out:
mutex_unlock(&uuid_mutex);
@@ -1983,11 +1982,11 @@ out:
error_undo:
if (device->writeable) {
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
- &root->fs_info->fs_devices->alloc_list);
+ &fs_info->fs_devices->alloc_list);
device->fs_devices->rw_devices++;
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
}
goto out;
}
@@ -2092,7 +2091,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
call_rcu(&tgtdev->rcu, free_device);
}
-static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
+ char *device_path,
struct btrfs_device **device)
{
int ret = 0;
@@ -2104,14 +2104,13 @@ static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
*device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- root->fs_info->bdev_holder, 0, &bdev, &bh);
+ fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
return ret;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
- disk_super->fsid);
+ *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
brelse(bh);
if (!*device)
ret = -ENOENT;
@@ -2119,7 +2118,7 @@ static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
return ret;
}
-int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
char *device_path,
struct btrfs_device **device)
{
@@ -2128,7 +2127,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
struct list_head *devices;
struct btrfs_device *tmp;
- devices = &root->fs_info->fs_devices->devices;
+ devices = &fs_info->fs_devices->devices;
/*
* It is safe to read the devices since the volume_mutex
* is held by the caller.
@@ -2145,30 +2144,28 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
return 0;
} else {
- return btrfs_find_device_by_path(root, device_path, device);
+ return btrfs_find_device_by_path(fs_info, device_path, device);
}
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
-int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
- char *devpath,
- struct btrfs_device **device)
+int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
+ char *devpath, struct btrfs_device **device)
{
int ret;
if (devid) {
ret = 0;
- *device = btrfs_find_device(root->fs_info, devid, NULL,
- NULL);
+ *device = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!*device)
ret = -ENOENT;
} else {
if (!devpath || !devpath[0])
return -EINVAL;
- ret = btrfs_find_device_missing_or_by_path(root, devpath,
+ ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
device);
}
return ret;
@@ -2177,12 +2174,12 @@ int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
/*
* does all the dirty work required for changing file system's UUID.
*/
-static int btrfs_prepare_sprout(struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
@@ -2208,15 +2205,15 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
@@ -2226,9 +2223,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
- memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
@@ -2241,8 +2238,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
* Store the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
@@ -2257,7 +2255,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- root = root->fs_info->chunk_root;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
@@ -2293,8 +2290,7 @@ next_slot:
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
- device = btrfs_find_device(root->fs_info, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2312,28 +2308,29 @@ error:
return ret;
}
-int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path)
{
+ struct btrfs_root *root = fs_info->dev_root;
struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
- struct super_block *sb = root->fs_info->sb;
+ struct super_block *sb = fs_info->sb;
struct rcu_string *name;
u64 tmp;
int seeding_dev = 0;
int ret = 0;
- if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+ if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding)
return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
- root->fs_info->bdev_holder);
+ fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- if (root->fs_info->fs_devices->seeding) {
+ if (fs_info->fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
@@ -2341,20 +2338,20 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
filemap_write_and_wait(bdev->bd_inode->i_mapping);
- devices = &root->fs_info->fs_devices->devices;
+ devices = &fs_info->fs_devices->devices;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
mutex_unlock(
- &root->fs_info->fs_devices->device_list_mutex);
+ &fs_info->fs_devices->device_list_mutex);
goto error;
}
}
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+ device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
@@ -2382,13 +2379,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->can_discard = 1;
device->writeable = 1;
device->generation = trans->transid;
- device->io_width = root->sectorsize;
- device->io_align = root->sectorsize;
- device->sector_size = root->sectorsize;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->dev_root = root->fs_info->dev_root;
+ device->fs_info = fs_info;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 0;
@@ -2398,61 +2395,60 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
- ret = btrfs_prepare_sprout(root);
+ ret = btrfs_prepare_sprout(fs_info);
BUG_ON(ret); /* -ENOMEM */
}
- device->fs_devices = root->fs_info->fs_devices;
+ device->fs_devices = fs_info->fs_devices;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- lock_chunks(root);
- list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
+ list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
- &root->fs_info->fs_devices->alloc_list);
- root->fs_info->fs_devices->num_devices++;
- root->fs_info->fs_devices->open_devices++;
- root->fs_info->fs_devices->rw_devices++;
- root->fs_info->fs_devices->total_devices++;
- root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+ &fs_info->fs_devices->alloc_list);
+ fs_info->fs_devices->num_devices++;
+ fs_info->fs_devices->open_devices++;
+ fs_info->fs_devices->rw_devices++;
+ fs_info->fs_devices->total_devices++;
+ fs_info->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += device->total_bytes;
- spin_unlock(&root->fs_info->free_chunk_lock);
+ spin_lock(&fs_info->free_chunk_lock);
+ fs_info->free_chunk_space += device->total_bytes;
+ spin_unlock(&fs_info->free_chunk_lock);
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
- root->fs_info->fs_devices->rotating = 1;
+ fs_info->fs_devices->rotating = 1;
- tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
- btrfs_set_super_total_bytes(root->fs_info->super_copy,
+ tmp = btrfs_super_total_bytes(fs_info->super_copy);
+ btrfs_set_super_total_bytes(fs_info->super_copy,
tmp + device->total_bytes);
- tmp = btrfs_super_num_devices(root->fs_info->super_copy);
- btrfs_set_super_num_devices(root->fs_info->super_copy,
- tmp + 1);
+ tmp = btrfs_super_num_devices(fs_info->super_copy);
+ btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
/* add sysfs device entry */
- btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
+ btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
* infos
*/
- btrfs_clear_space_info_full(root->fs_info);
+ btrfs_clear_space_info_full(fs_info);
- unlock_chunks(root);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
- lock_chunks(root);
- ret = init_first_rw_device(trans, root, device);
- unlock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = init_first_rw_device(trans, fs_info, device);
+ mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
- ret = btrfs_add_device(trans, root, device);
+ ret = btrfs_add_device(trans, fs_info, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
@@ -2461,7 +2457,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
- ret = btrfs_finish_sprout(trans, root);
+ ret = btrfs_finish_sprout(trans, fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
@@ -2471,16 +2467,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
* so rename the fsid on the sysfs
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
- root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
- fsid_buf))
- btrfs_warn(root->fs_info,
- "sysfs: failed to create fsid for sprout");
+ fs_info->fsid);
+ if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
+ btrfs_warn(fs_info,
+ "sysfs: failed to create fsid for sprout");
}
- root->fs_info->num_tolerated_disk_barrier_failures =
- btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
- ret = btrfs_commit_transaction(trans, root);
+ fs_info->num_tolerated_disk_barrier_failures =
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+ ret = btrfs_commit_transaction(trans);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
@@ -2489,9 +2484,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (ret) /* transaction commit */
return ret;
- ret = btrfs_relocate_sys_chunks(root);
+ ret = btrfs_relocate_sys_chunks(fs_info);
if (ret < 0)
- btrfs_handle_fs_error(root->fs_info, ret,
+ btrfs_handle_fs_error(fs_info, ret,
"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
@@ -2499,7 +2494,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return 0;
return PTR_ERR(trans);
}
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
}
/* Update ctime/mtime for libblkid */
@@ -2507,9 +2502,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
return ret;
error_trans:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
rcu_string_free(device->name);
- btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2520,14 +2515,14 @@ error:
return ret;
}
-int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
@@ -2585,19 +2580,19 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
device->writeable = 1;
device->generation = 0;
- device->io_width = root->sectorsize;
- device->io_align = root->sectorsize;
- device->sector_size = root->sectorsize;
+ device->io_width = fs_info->sectorsize;
+ device->io_align = fs_info->sectorsize;
+ device->sector_size = fs_info->sectorsize;
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
ASSERT(list_empty(&srcdev->resized_list));
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
- device->dev_root = fs_info->dev_root;
+ device->fs_info = fs_info;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 1;
@@ -2608,7 +2603,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
*device_out = device;
return ret;
@@ -2621,11 +2616,13 @@ error:
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
+ u32 sectorsize = fs_info->sectorsize;
+
WARN_ON(fs_info->fs_devices->rw_devices == 0);
- tgtdev->io_width = fs_info->dev_root->sectorsize;
- tgtdev->io_align = fs_info->dev_root->sectorsize;
- tgtdev->sector_size = fs_info->dev_root->sectorsize;
- tgtdev->dev_root = fs_info->dev_root;
+ tgtdev->io_width = sectorsize;
+ tgtdev->io_align = sectorsize;
+ tgtdev->sector_size = sectorsize;
+ tgtdev->fs_info = fs_info;
tgtdev->in_fs_metadata = 1;
}
@@ -2634,13 +2631,11 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_path *path;
- struct btrfs_root *root;
+ struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
- root = device->dev_root->fs_info->chunk_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2680,8 +2675,8 @@ out:
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
- struct btrfs_super_block *super_copy =
- device->dev_root->fs_info->super_copy;
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_fs_devices *fs_devices;
u64 old_total;
u64 diff;
@@ -2689,41 +2684,41 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
if (!device->writeable)
return -EACCES;
- lock_chunks(device->dev_root);
+ mutex_lock(&fs_info->chunk_mutex);
old_total = btrfs_super_total_bytes(super_copy);
diff = new_size - device->total_bytes;
if (new_size <= device->total_bytes ||
device->is_tgtdev_for_dev_replace) {
- unlock_chunks(device->dev_root);
+ mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
- fs_devices = device->dev_root->fs_info->fs_devices;
+ fs_devices = fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
- btrfs_clear_space_info_full(device->dev_root->fs_info);
+ btrfs_clear_space_info_full(device->fs_info);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
&fs_devices->resized_devices);
- unlock_chunks(device->dev_root);
+ mutex_unlock(&fs_info->chunk_mutex);
return btrfs_update_device(trans, device);
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 chunk_objectid,
+ struct btrfs_fs_info *fs_info, u64 chunk_objectid,
u64 chunk_offset)
{
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2736,25 +2731,25 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_handle_fs_error(root->fs_info, -ENOENT,
- "Failed lookup while freeing chunk.");
+ btrfs_handle_fs_error(fs_info, -ENOENT,
+ "Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_handle_fs_error(root->fs_info, ret,
- "Failed to delete chunk item.");
+ btrfs_handle_fs_error(fs_info, ret,
+ "Failed to delete chunk item.");
out:
btrfs_free_path(path);
return ret;
}
-static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
- chunk_offset)
+static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
+ u64 chunk_objectid, u64 chunk_offset)
{
- struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
@@ -2765,7 +2760,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
u32 cur;
struct btrfs_key key;
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2795,25 +2790,22 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
cur += len;
}
}
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 chunk_offset)
+ struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
int i, ret = 0;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- /* Just in case */
- root = root->fs_info->chunk_root;
- em_tree = &root->fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
@@ -2832,9 +2824,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
return -EINVAL;
}
map = em->map_lookup;
- lock_chunks(root->fs_info->chunk_root);
- check_system_chunk(trans, extent_root, map->type);
- unlock_chunks(root->fs_info->chunk_root);
+ mutex_lock(&fs_info->chunk_mutex);
+ check_system_chunk(trans, fs_info, map->type);
+ mutex_unlock(&fs_info->chunk_mutex);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2854,14 +2846,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
if (device->bytes_used > 0) {
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += dev_extent_len;
- spin_unlock(&root->fs_info->free_chunk_lock);
- btrfs_clear_space_info_full(root->fs_info);
- unlock_chunks(root);
+ spin_lock(&fs_info->free_chunk_lock);
+ fs_info->free_chunk_space += dev_extent_len;
+ spin_unlock(&fs_info->free_chunk_lock);
+ btrfs_clear_space_info_full(fs_info);
+ mutex_unlock(&fs_info->chunk_mutex);
}
if (map->stripes[i].dev) {
@@ -2875,23 +2867,24 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
+ ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+ ret = btrfs_del_sys_chunk(fs_info, chunk_objectid,
+ chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
- ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2903,15 +2896,12 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct btrfs_root *extent_root;
+ struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
int ret;
- root = root->fs_info->chunk_root;
- extent_root = root->fs_info->extent_root;
-
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -2924,16 +2914,16 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+ ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
- ret = btrfs_can_relocate(extent_root, chunk_offset);
+ ret = btrfs_can_relocate(fs_info, chunk_offset);
if (ret)
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
- btrfs_scrub_pause(root);
- ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- btrfs_scrub_continue(root);
+ btrfs_scrub_pause(fs_info);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ btrfs_scrub_continue(fs_info);
if (ret)
return ret;
@@ -2949,14 +2939,14 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
* step two, delete the device extents and the
* chunk tree entries
*/
- ret = btrfs_remove_chunk(trans, root, chunk_offset);
- btrfs_end_transaction(trans, extent_root);
+ ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
+ btrfs_end_transaction(trans);
return ret;
}
-static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
@@ -2977,10 +2967,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@@ -2988,7 +2978,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
@@ -3003,14 +2993,13 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(chunk_root,
- found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset);
if (ret == -ENOSPC)
failed++;
else
BUG_ON(ret);
}
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
@@ -3029,9 +3018,10 @@ error:
return ret;
}
-static int insert_balance_item(struct btrfs_root *root,
+static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
@@ -3062,7 +3052,7 @@ static int insert_balance_item(struct btrfs_root *root,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+ memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
btrfs_set_balance_data(leaf, item, &disk_bargs);
@@ -3076,14 +3066,15 @@ static int insert_balance_item(struct btrfs_root *root,
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
- err = btrfs_commit_transaction(trans, root);
+ err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
}
-static int del_balance_item(struct btrfs_root *root)
+static int del_balance_item(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
@@ -3114,7 +3105,7 @@ static int del_balance_item(struct btrfs_root *root)
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
- err = btrfs_commit_transaction(trans, root);
+ err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
@@ -3369,11 +3360,11 @@ static int chunk_soft_convert_filter(u64 chunk_type,
return 0;
}
-static int should_balance_chunk(struct btrfs_root *root,
+static int should_balance_chunk(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
- struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
@@ -3398,10 +3389,10 @@ static int should_balance_chunk(struct btrfs_root *root,
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
- chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ chunk_usage_filter(fs_info, chunk_offset, bargs)) {
return 0;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
- chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+ chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
return 0;
}
@@ -3521,7 +3512,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_grow_device(trans, device, old_size);
if (ret) {
- btrfs_end_transaction(trans, dev_root);
+ btrfs_end_transaction(trans);
/* btrfs_grow_device never returns ret > 0 */
WARN_ON(ret > 0);
btrfs_info_in_rcu(fs_info,
@@ -3531,7 +3522,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
goto error;
}
- btrfs_end_transaction(trans, dev_root);
+ btrfs_end_transaction(trans);
}
/* step two, relocate all the chunks */
@@ -3606,7 +3597,7 @@ again:
spin_unlock(&fs_info->balance_lock);
}
- ret = should_balance_chunk(chunk_root, leaf, chunk,
+ ret = should_balance_chunk(fs_info, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
@@ -3659,9 +3650,9 @@ again:
goto error;
}
- ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ ret = btrfs_force_chunk_alloc(trans, fs_info,
BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans, chunk_root);
+ btrfs_end_transaction(trans);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
@@ -3669,8 +3660,7 @@ again:
chunk_reserved = 1;
}
- ret = btrfs_relocate_chunk(chunk_root,
- found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto error;
@@ -3741,7 +3731,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
int ret;
unset_balance_control(fs_info);
- ret = del_balance_item(fs_info->tree_root);
+ ret = del_balance_item(fs_info);
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
@@ -3874,7 +3864,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
bctl->sys.target));
}
- ret = insert_balance_item(fs_info->tree_root, bctl);
+ ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -4166,7 +4156,7 @@ static int btrfs_uuid_scan_kthread(void *data)
}
update_tree:
if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ ret = btrfs_uuid_tree_add(trans, fs_info,
root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
@@ -4178,7 +4168,7 @@ update_tree:
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+ ret = btrfs_uuid_tree_add(trans, fs_info,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
@@ -4191,7 +4181,7 @@ update_tree:
skip:
if (trans) {
- ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+ ret = btrfs_end_transaction(trans);
trans = NULL;
if (ret)
break;
@@ -4216,7 +4206,7 @@ skip:
out:
btrfs_free_path(path);
if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans, fs_info->uuid_root);
+ btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
else
@@ -4310,13 +4300,13 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction(trans);
return ret;
}
fs_info->uuid_root = uuid_root;
- ret = btrfs_commit_transaction(trans, tree_root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
@@ -4355,8 +4345,9 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
*/
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
@@ -4368,7 +4359,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
- struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff = old_size - new_size;
@@ -4382,16 +4373,16 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
path->reada = READA_FORWARD;
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space -= diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
+ spin_lock(&fs_info->free_chunk_lock);
+ fs_info->free_chunk_space -= diff;
+ spin_unlock(&fs_info->free_chunk_lock);
}
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
again:
key.objectid = device->devid;
@@ -4399,16 +4390,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret)
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
@@ -4422,7 +4413,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4431,7 +4422,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4439,8 +4430,8 @@ again:
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_offset);
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -4463,7 +4454,7 @@ again:
goto done;
}
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
/*
* We checked in the above loop all device extents that were already in
@@ -4483,11 +4474,11 @@ again:
if (contains_pending_extent(trans->transaction, device,
&start, len)) {
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
checked_pending_chunks = true;
failed = 0;
retried = false;
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans);
if (ret)
goto done;
goto again;
@@ -4497,44 +4488,44 @@ again:
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
- &root->fs_info->fs_devices->resized_devices);
+ &fs_info->fs_devices->resized_devices);
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
done:
btrfs_free_path(path);
if (ret) {
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
+ spin_lock(&fs_info->free_chunk_lock);
+ fs_info->free_chunk_space += diff;
+ spin_unlock(&fs_info->free_chunk_lock);
+ mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
}
-static int btrfs_add_system_chunk(struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
return -EFBIG;
}
@@ -4545,7 +4536,7 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
return 0;
}
@@ -4583,7 +4574,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \
+#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -4593,10 +4584,10 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
/ sizeof(struct btrfs_stripe) + 1)
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 start,
+ struct btrfs_fs_info *fs_info, u64 start,
u64 type)
{
- struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct list_head *cur;
struct map_lookup *map = NULL;
@@ -4762,12 +4753,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_RAID5) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
- extent_root->stripesize);
+ info->stripesize);
data_stripes = num_stripes - 1;
}
if (type & BTRFS_BLOCK_GROUP_RAID6) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
- extent_root->stripesize);
+ info->stripesize);
data_stripes = num_stripes - 2;
}
@@ -4812,7 +4803,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
j * stripe_size;
}
}
- map->sector_size = extent_root->sectorsize;
+ map->sector_size = info->sectorsize;
map->stripe_len = raid_stripe_len;
map->io_align = raid_stripe_len;
map->io_width = raid_stripe_len;
@@ -4821,7 +4812,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
num_bytes = stripe_size * data_stripes;
- trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+ trace_btrfs_chunk_alloc(info, map, start, num_bytes);
em = alloc_extent_map();
if (!em) {
@@ -4837,7 +4828,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->block_len = em->len;
em->orig_block_len = stripe_size;
- em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ em_tree = &info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
@@ -4850,7 +4841,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
goto error;
}
- ret = btrfs_make_block_group(trans, extent_root, 0, type,
+ ret = btrfs_make_block_group(trans, info, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
if (ret)
@@ -4861,13 +4852,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
}
- spin_lock(&extent_root->fs_info->free_chunk_lock);
- extent_root->fs_info->free_chunk_space -= (stripe_size *
- map->num_stripes);
- spin_unlock(&extent_root->fs_info->free_chunk_lock);
+ spin_lock(&info->free_chunk_lock);
+ info->free_chunk_space -= (stripe_size * map->num_stripes);
+ spin_unlock(&info->free_chunk_lock);
free_extent_map(em);
- check_raid56_incompat_flag(extent_root->fs_info, type);
+ check_raid56_incompat_flag(info, type);
kfree(devices_info);
return 0;
@@ -4889,11 +4879,12 @@ error:
}
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
+ struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 chunk_size)
{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
- struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
@@ -4906,20 +4897,19 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+ em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_crit(extent_root->fs_info,
- "unable to find logical %Lu len %Lu",
+ btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
chunk_offset, chunk_size);
return -EINVAL;
}
if (em->start != chunk_offset || em->len != chunk_size) {
- btrfs_crit(extent_root->fs_info,
+ btrfs_crit(fs_info,
"found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
chunk_offset, chunk_size, em->start, em->len);
free_extent_map(em);
@@ -4943,7 +4933,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
* at any time during that final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()).
*/
- mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
@@ -4960,7 +4950,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
break;
}
if (ret) {
- mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
goto out;
}
@@ -4974,7 +4964,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
- mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -4983,7 +4973,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
- btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+ btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
@@ -4996,8 +4986,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
* TODO: Cleanup of inserted chunk root in case of
* failure.
*/
- ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
- item_size);
+ ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
}
out:
@@ -5014,36 +5003,34 @@ out:
* bootstrap process of adding storage to a seed btrfs.
*/
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 type)
+ struct btrfs_fs_info *fs_info, u64 type)
{
u64 chunk_offset;
- ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
- chunk_offset = find_next_chunk(extent_root->fs_info);
- return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
+ ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ chunk_offset = find_next_chunk(fs_info);
+ return __btrfs_alloc_chunk(trans, fs_info, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
+ struct btrfs_root *extent_root = fs_info->extent_root;
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
- alloc_profile);
+ ret = __btrfs_alloc_chunk(trans, fs_info, chunk_offset, alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = find_next_chunk(root->fs_info);
+ sys_chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
- ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+ ret = __btrfs_alloc_chunk(trans, fs_info, sys_chunk_offset,
alloc_profile);
return ret;
}
@@ -5066,11 +5053,11 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return max_errors;
}
-int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
@@ -5182,14 +5169,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return ret;
}
-unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct btrfs_mapping_tree *map_tree,
u64 logical)
{
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
- unsigned long len = root->sectorsize;
+ unsigned long len = fs_info->sectorsize;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
@@ -5329,7 +5316,8 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
kfree(bbio);
}
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map)
@@ -5414,7 +5402,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
raid56_full_stripe_start *= full_stripe_len;
}
- if (op == REQ_OP_DISCARD) {
+ if (op == BTRFS_MAP_DISCARD) {
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
@@ -5427,7 +5415,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- (op == REQ_OP_WRITE)) {
+ (op == BTRFS_MAP_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
} else {
@@ -5452,8 +5440,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
- op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
+ op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+ op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
/*
* in dev-replace case, for repair case (that's the only
* case where the mirror is selected explicitly when
@@ -5474,7 +5462,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
int found = 0;
u64 physical_of_found = 0;
- ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
logical, &tmp_length, &tmp_bbio, 0, 0);
if (ret) {
WARN_ON(tmp_bbio != NULL);
@@ -5484,7 +5472,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
tmp_num_stripes = tmp_bbio->num_stripes;
if (mirror_num > tmp_num_stripes) {
/*
- * REQ_GET_READ_MIRRORS does not contain this
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this
* mirror, that means that the requested area
* is not left of the left cursor
*/
@@ -5540,17 +5528,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
(offset + *length);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (op == REQ_OP_DISCARD)
+ if (op == BTRFS_MAP_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
- op != REQ_GET_READ_MIRRORS)
+ if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+ op != BTRFS_MAP_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
- op == REQ_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
+ op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5563,8 +5551,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
- op == REQ_GET_READ_MIRRORS) {
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
+ op == BTRFS_MAP_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5578,9 +5566,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
- if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (op == REQ_OP_DISCARD)
+ else if (op == BTRFS_MAP_DISCARD)
num_stripes = min_t(u64, map->sub_stripes *
(stripe_nr_end - stripe_nr_orig),
map->num_stripes);
@@ -5598,7 +5586,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
if (need_raid_map &&
- (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS ||
+ (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div_u64(raid56_full_stripe_start,
@@ -5626,8 +5614,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
- op != REQ_GET_READ_MIRRORS) && mirror_num <= 1)
+ if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+ op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5650,9 +5638,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing) {
- if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
num_alloc_stripes <<= 1;
- if (op == REQ_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
tgtdev_indexes = num_stripes;
}
@@ -5668,7 +5656,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
need_raid_map &&
- ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) ||
+ ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5693,7 +5681,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
RAID6_Q_STRIPE;
}
- if (op == REQ_OP_DISCARD) {
+ if (op == BTRFS_MAP_DISCARD) {
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
@@ -5773,7 +5761,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
}
}
- if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
@@ -5781,7 +5769,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
tgtdev_indexes = 0;
if (dev_replace_is_ongoing &&
- (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) &&
+ (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
dev_replace->tgtdev != NULL) {
int index_where_to_add;
u64 srcdev_devid = dev_replace->srcdev->devid;
@@ -5816,7 +5804,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
}
}
num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) &&
+ } else if (dev_replace_is_ongoing &&
+ op == BTRFS_MAP_GET_READ_MIRRORS &&
dev_replace->tgtdev != NULL) {
u64 srcdev_devid = dev_replace->srcdev->devid;
int index_srcdev = 0;
@@ -5888,7 +5877,7 @@ out:
return ret;
}
-int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
@@ -5897,7 +5886,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
}
/* For Scrub/replace */
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
int need_raid_map)
@@ -6023,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio)
else
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_READ_ERRS);
- if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
+ if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
btrfs_dev_stat_print_on_error(dev);
@@ -6069,10 +6058,10 @@ static void btrfs_end_bio(struct bio *bio)
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
-static noinline void btrfs_schedule_bio(struct btrfs_root *root,
- struct btrfs_device *device,
+static noinline void btrfs_schedule_bio(struct btrfs_device *device,
struct bio *bio)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
@@ -6095,12 +6084,12 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
* made progress against dirty pages when we've really just put it
* on a queue for later
*/
- atomic_inc(&root->fs_info->nr_async_bios);
+ atomic_inc(&fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
spin_lock(&device->io_lock);
- if (bio->bi_opf & REQ_SYNC)
+ if (op_is_sync(bio->bi_opf))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
@@ -6117,15 +6106,14 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_work(root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(fs_info->submit_workers, &device->work);
}
-static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
- struct bio *bio, u64 physical, int dev_nr,
- int async)
+static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+ u64 physical, int dev_nr, int async)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
@@ -6148,10 +6136,10 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
#endif
bio->bi_bdev = dev->bdev;
- btrfs_bio_counter_inc_noblocked(root->fs_info);
+ btrfs_bio_counter_inc_noblocked(fs_info);
if (async)
- btrfs_schedule_bio(root, dev, bio);
+ btrfs_schedule_bio(dev, bio);
else
btrfsic_submit_bio(bio);
}
@@ -6170,7 +6158,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
}
}
-int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
+int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, int async_submit)
{
struct btrfs_device *dev;
@@ -6186,11 +6174,11 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
- btrfs_bio_counter_inc_blocked(root->fs_info);
- ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical,
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = __btrfs_map_block(fs_info, bio_op(bio), logical,
&map_length, &bbio, mirror_num, 1);
if (ret) {
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
return ret;
}
@@ -6198,7 +6186,7 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = root->fs_info;
+ bbio->fs_info = fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
@@ -6206,18 +6194,19 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (bio_op(bio) == REQ_OP_WRITE) {
- ret = raid56_parity_write(root, bio, bbio, map_length);
+ ret = raid56_parity_write(fs_info, bio, bbio,
+ map_length);
} else {
- ret = raid56_parity_recover(root, bio, bbio, map_length,
- mirror_num, 1);
+ ret = raid56_parity_recover(fs_info, bio, bbio,
+ map_length, mirror_num, 1);
}
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
return ret;
}
if (map_length < length) {
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"mapping failed logical %llu bio len %llu len %llu",
logical, length, map_length);
BUG();
@@ -6237,11 +6226,10 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
} else
bio = first_bio;
- submit_stripe_bio(root, bbio, bio,
- bbio->stripes[dev_nr].physical, dev_nr,
- async_submit);
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
+ dev_nr, async_submit);
}
- btrfs_bio_counter_dec(root->fs_info);
+ btrfs_bio_counter_dec(fs_info);
return 0;
}
@@ -6265,8 +6253,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
return NULL;
}
-static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
- struct btrfs_fs_devices *fs_devices,
+static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
@@ -6337,7 +6324,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
}
/* Return -EIO if any error, otherwise return 0. */
-static int btrfs_check_chunk_valid(struct btrfs_root *root,
+static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical)
{
@@ -6354,33 +6341,31 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
type = btrfs_chunk_type(leaf, chunk);
if (!num_stripes) {
- btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+ btrfs_err(fs_info, "invalid chunk num_stripes: %u",
num_stripes);
return -EIO;
}
- if (!IS_ALIGNED(logical, root->sectorsize)) {
- btrfs_err(root->fs_info,
- "invalid chunk logical %llu", logical);
+ if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "invalid chunk logical %llu", logical);
return -EIO;
}
- if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
- btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+ if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+ btrfs_err(fs_info, "invalid chunk sectorsize %u",
btrfs_chunk_sector_size(leaf, chunk));
return -EIO;
}
- if (!length || !IS_ALIGNED(length, root->sectorsize)) {
- btrfs_err(root->fs_info,
- "invalid chunk length %llu", length);
+ if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "invalid chunk length %llu", length);
return -EIO;
}
if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
- btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+ btrfs_err(fs_info, "invalid chunk stripe length: %llu",
stripe_len);
return -EIO;
}
if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
type) {
- btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+ btrfs_err(fs_info, "unrecognized chunk type: %llu",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK) &
btrfs_chunk_type(leaf, chunk));
@@ -6393,7 +6378,7 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
(type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
- btrfs_err(root->fs_info,
+ btrfs_err(fs_info,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -6403,11 +6388,11 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
return 0;
}
-static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
- struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
@@ -6424,7 +6409,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+ ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
if (ret)
return ret;
@@ -6471,23 +6456,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+ map->stripes[i].dev = btrfs_find_device(fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev &&
- !btrfs_test_opt(root->fs_info, DEGRADED)) {
+ !btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
return -EIO;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
- add_missing_dev(root, root->fs_info->fs_devices,
- devid, uuid);
+ add_missing_dev(fs_info->fs_devices, devid,
+ uuid);
if (!map->stripes[i].dev) {
free_extent_map(em);
return -EIO;
}
- btrfs_warn(root->fs_info,
- "devid %llu uuid %pU is missing",
+ btrfs_warn(fs_info, "devid %llu uuid %pU is missing",
devid, uuid);
}
map->stripes[i].dev->in_fs_metadata = 1;
@@ -6525,7 +6509,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
-static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
@@ -6533,7 +6517,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
BUG_ON(!mutex_is_locked(&uuid_mutex));
- fs_devices = root->fs_info->fs_devices->seed;
+ fs_devices = fs_info->fs_devices->seed;
while (fs_devices) {
if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
return fs_devices;
@@ -6543,7 +6527,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- if (!btrfs_test_opt(root->fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
fs_devices = alloc_fs_devices(fsid);
@@ -6560,7 +6544,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
return fs_devices;
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
- root->fs_info->bdev_holder);
+ fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(ret);
@@ -6574,17 +6558,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
goto out;
}
- fs_devices->seed = root->fs_info->fs_devices->seed;
- root->fs_info->fs_devices->seed = fs_devices;
+ fs_devices->seed = fs_info->fs_devices->seed;
+ fs_info->fs_devices->seed = fs_devices;
out:
return fs_devices;
}
-static int read_one_dev(struct btrfs_root *root,
+static int read_one_dev(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
@@ -6597,24 +6581,24 @@ static int read_one_dev(struct btrfs_root *root,
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
- if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
- fs_devices = open_seed_devices(root, fs_uuid);
+ if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) {
+ fs_devices = open_seed_devices(fs_info, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
- if (!btrfs_test_opt(root->fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED))
return -EIO;
- device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+ device = add_missing_dev(fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
- btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+ btrfs_warn(fs_info, "devid %llu uuid %pU missing",
devid, dev_uuid);
} else {
- if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED))
+ if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED))
return -EIO;
if(!device->bdev && !device->missing) {
@@ -6643,7 +6627,7 @@ static int read_one_dev(struct btrfs_root *root,
}
}
- if (device->fs_devices != root->fs_info->fs_devices) {
+ if (device->fs_devices != fs_info->fs_devices) {
BUG_ON(device->writeable);
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
@@ -6654,18 +6638,18 @@ static int read_one_dev(struct btrfs_root *root,
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += device->total_bytes -
+ spin_lock(&fs_info->free_chunk_lock);
+ fs_info->free_chunk_space += device->total_bytes -
device->bytes_used;
- spin_unlock(&root->fs_info->free_chunk_lock);
+ spin_unlock(&fs_info->free_chunk_lock);
}
ret = 0;
return ret;
}
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@@ -6680,13 +6664,13 @@ int btrfs_read_sys_array(struct btrfs_root *root)
u64 type;
struct btrfs_key key;
- ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
/*
* This will create extent buffer of nodesize, superblock size is
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
- sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
+ sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
@@ -6757,7 +6741,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (cur_offset + len > array_size)
goto out_short_read;
- ret = read_one_chunk(root, &key, sb, chunk);
+ ret = read_one_chunk(fs_info, &key, sb, chunk);
if (ret)
break;
} else {
@@ -6783,8 +6767,9 @@ out_short_read:
return -EIO;
}
-int btrfs_read_chunk_tree(struct btrfs_root *root)
+int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -6793,14 +6778,12 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
int slot;
u64 total_dev = 0;
- root = root->fs_info->chunk_root;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
mutex_lock(&uuid_mutex);
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
/*
* Read all device items, and then all the chunk items. All
@@ -6830,14 +6813,14 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
- ret = read_one_dev(root, leaf, dev_item);
+ ret = read_one_dev(fs_info, leaf, dev_item);
if (ret)
goto error;
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- ret = read_one_chunk(root, &found_key, leaf, chunk);
+ ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
if (ret)
goto error;
}
@@ -6848,26 +6831,26 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
* After loading chunk tree, we've got all device information,
* do another round of validation checks.
*/
- if (total_dev != root->fs_info->fs_devices->total_devices) {
- btrfs_err(root->fs_info,
+ if (total_dev != fs_info->fs_devices->total_devices) {
+ btrfs_err(fs_info,
"super_num_devices %llu mismatch with num_devices %llu found here",
- btrfs_super_num_devices(root->fs_info->super_copy),
+ btrfs_super_num_devices(fs_info->super_copy),
total_dev);
ret = -EINVAL;
goto error;
}
- if (btrfs_super_total_bytes(root->fs_info->super_copy) <
- root->fs_info->fs_devices->total_rw_bytes) {
- btrfs_err(root->fs_info,
+ if (btrfs_super_total_bytes(fs_info->super_copy) <
+ fs_info->fs_devices->total_rw_bytes) {
+ btrfs_err(fs_info,
"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
- btrfs_super_total_bytes(root->fs_info->super_copy),
- root->fs_info->fs_devices->total_rw_bytes);
+ btrfs_super_total_bytes(fs_info->super_copy),
+ fs_info->fs_devices->total_rw_bytes);
ret = -EINVAL;
goto error;
}
ret = 0;
error:
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
@@ -6882,7 +6865,7 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
while (fs_devices) {
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
- device->dev_root = fs_info->dev_root;
+ device->fs_info = fs_info;
mutex_unlock(&fs_devices->device_list_mutex);
fs_devices = fs_devices->seed;
@@ -6959,9 +6942,10 @@ out:
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *dev_root,
+ struct btrfs_fs_info *fs_info,
struct btrfs_device *device)
{
+ struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
@@ -6977,7 +6961,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(dev_root->fs_info,
+ btrfs_warn_in_rcu(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
@@ -6988,7 +6972,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(dev_root->fs_info,
+ btrfs_warn_in_rcu(fs_info,
"delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
@@ -7002,7 +6986,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(dev_root->fs_info,
+ btrfs_warn_in_rcu(fs_info,
"insert dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
@@ -7027,7 +7011,6 @@ out:
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int stats_cnt;
@@ -7039,7 +7022,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
continue;
stats_cnt = atomic_read(&device->dev_stats_ccnt);
- ret = update_dev_stat_item(trans, dev_root, device);
+ ret = update_dev_stat_item(trans, fs_info, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
@@ -7058,7 +7041,7 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+ btrfs_err_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7078,7 +7061,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->dev_root->fs_info,
+ btrfs_info_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7088,24 +7071,22 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
-int btrfs_get_dev_stats(struct btrfs_root *root,
+int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
struct btrfs_device *dev;
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
- btrfs_warn(root->fs_info,
- "get dev_stats failed, device not found");
+ btrfs_warn(fs_info, "get dev_stats failed, device not found");
return -ENODEV;
} else if (!dev->dev_stats_valid) {
- btrfs_warn(root->fs_info,
- "get dev_stats failed, not yet valid");
+ btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
return -ENODEV;
} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
@@ -7168,18 +7149,18 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
return;
mutex_lock(&fs_devices->device_list_mutex);
- lock_chunks(fs_info->dev_root);
+ mutex_lock(&fs_info->chunk_mutex);
list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
resized_list) {
list_del_init(&curr->resized_list);
curr->commit_total_bytes = curr->disk_total_bytes;
}
- unlock_chunks(fs_info->dev_root);
+ mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_devices->device_list_mutex);
}
/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *transaction)
{
struct extent_map *em;
@@ -7191,7 +7172,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
return;
/* In order to kick the device replace finish process */
- lock_chunks(root);
+ mutex_lock(&fs_info->chunk_mutex);
list_for_each_entry(em, &transaction->pending_chunks, list) {
map = em->map_lookup;
@@ -7200,7 +7181,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
dev->commit_bytes_used = dev->bytes_used;
}
}
- unlock_chunks(root);
+ mutex_unlock(&fs_info->chunk_mutex);
}
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 09ed29c67848..24ba6bc3ec34 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -51,8 +51,7 @@ struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices;
-
- struct btrfs_root *dev_root;
+ struct btrfs_fs_info *fs_info;
struct rcu_string *name;
@@ -62,7 +61,7 @@ struct btrfs_device {
int running_pending;
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
- /* WRITE_SYNC bios */
+ /* sync bios */
struct btrfs_pending_bios pending_sync_bios;
struct block_device *bdev;
@@ -371,27 +370,48 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+enum btrfs_map_op {
+ BTRFS_MAP_READ,
+ BTRFS_MAP_WRITE,
+ BTRFS_MAP_DISCARD,
+ BTRFS_MAP_GET_READ_MIRRORS,
+};
+
+static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+{
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ return BTRFS_MAP_DISCARD;
+ case REQ_OP_WRITE:
+ return BTRFS_MAP_WRITE;
+ default:
+ WARN_ON_ONCE(1);
+ case REQ_OP_READ:
+ return BTRFS_MAP_READ;
+ }
+}
+
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
void btrfs_get_bbio(struct btrfs_bio *bbio);
void btrfs_put_bbio(struct btrfs_bio *bbio);
-int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
int need_raid_map);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
-int btrfs_read_sys_array(struct btrfs_root *root);
-int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
+int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, u64 type);
+ struct btrfs_fs_info *fs_info, u64 type);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
+int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, int async_submit);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
@@ -401,16 +421,17 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
struct btrfs_device *device, struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
char *device_path,
struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
+int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
char *devpath,
struct btrfs_device **device);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
-int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid);
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ char *device_path, u64 devid);
void btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
@@ -418,8 +439,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
-int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+ char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out);
int btrfs_balance(struct btrfs_balance_control *bctl,
@@ -430,7 +452,7 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
struct btrfs_device *device, u64 num_bytes,
u64 search_start, u64 *start, u64 *max_avail);
@@ -438,7 +460,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
-int btrfs_get_dev_stats(struct btrfs_root *root,
+int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
@@ -455,14 +477,14 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
-unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct btrfs_mapping_tree *map_tree,
u64 logical);
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
+ struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 chunk_size);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 chunk_offset);
+ struct btrfs_fs_info *fs_info, u64 chunk_offset);
static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
{
@@ -509,19 +531,9 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
}
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
-void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *transaction);
-static inline void lock_chunks(struct btrfs_root *root)
-{
- mutex_lock(&root->fs_info->chunk_mutex);
-}
-
-static inline void unlock_chunks(struct btrfs_root *root)
-{
- mutex_unlock(&root->fs_info->chunk_mutex);
-}
-
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index fccbf5567e78..9621c7f2503e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -94,11 +94,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
size_t name_len = strlen(name);
int ret = 0;
- if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
return -ENOSPC;
path = btrfs_alloc_path();
@@ -149,14 +150,14 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
*/
ret = 0;
btrfs_assert_tree_locked(path->nodes[0]);
- di = btrfs_match_dir_item_name(root, path, name, name_len);
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
goto out;
}
} else if (ret == -EEXIST) {
ret = 0;
- di = btrfs_match_dir_item_name(root, path, name, name_len);
+ di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
ASSERT(di); /* logic error */
} else if (ret) {
goto out;
@@ -185,7 +186,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
char *ptr;
if (size > old_data_len) {
- if (btrfs_leaf_free_space(root, leaf) <
+ if (btrfs_leaf_free_space(fs_info, leaf) <
(size - old_data_len)) {
ret = -ENOSPC;
goto out;
@@ -195,16 +196,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
- btrfs_extend_item(root, path,
+ btrfs_extend_item(fs_info, path,
size - old_data_len);
else if (size < old_data_len)
- btrfs_truncate_item(root, path, data_size, 1);
+ btrfs_truncate_item(fs_info, path,
+ data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
- btrfs_extend_item(root, path, data_size);
+ btrfs_extend_item(fs_info, path, data_size);
}
item = btrfs_item_nr(slot);
@@ -257,7 +259,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans);
return ret;
}
@@ -265,6 +267,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret = 0;
@@ -333,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
- if (verify_dir_item(root, leaf, di)) {
+ if (verify_dir_item(fs_info, leaf, di)) {
ret = -EIO;
goto err;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 441b81a3e545..da497f184ff4 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -210,10 +210,9 @@ out:
return ret;
}
-static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
u64 disk_start,
- struct bio_vec *bvec,
- int vcnt,
+ struct bio *orig_bio,
size_t srclen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -222,10 +221,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
char *data_in;
size_t total_out = 0;
unsigned long page_in_index = 0;
- unsigned long page_out_index = 0;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
- unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
@@ -235,7 +232,6 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
workspace->strm.total_out = 0;
workspace->strm.next_out = workspace->buf;
workspace->strm.avail_out = PAGE_SIZE;
- pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -250,6 +246,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
+ kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -266,8 +263,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
total_out, disk_start,
- bvec, vcnt,
- &page_out_index, &pg_offset);
+ orig_bio);
if (ret2 == 0) {
ret = 0;
goto done;
@@ -300,7 +296,7 @@ done:
if (data_in)
kunmap(pages_in[page_in_index]);
if (!ret)
- btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset);
+ zero_fill_bio(orig_bio);
return ret;
}
@@ -407,6 +403,6 @@ const struct btrfs_compress_op btrfs_zlib_compress = {
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
- .decompress_biovec = zlib_decompress_biovec,
+ .decompress_bio = zlib_decompress_bio,
.decompress = zlib_decompress,
};
diff --git a/fs/buffer.c b/fs/buffer.c
index b205a629001d..0e87401cf335 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -43,6 +43,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/pagevec.h>
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -753,7 +754,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* still in flight on potentially older
* contents.
*/
- write_dirty_buffer(bh, WRITE_SYNC);
+ write_dirty_buffer(bh, REQ_SYNC);
/*
* Kick off IO for the previous mapping. Note
@@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page,
}
EXPORT_SYMBOL(create_empty_buffers);
-/*
- * We are taking a block for data and we don't want any output from any
- * buffer-cache aliases starting from return from that function and
- * until the moment when something will explicitly mark the buffer
- * dirty (hopefully that will not happen until we will free that block ;-)
- * We don't even need to mark it not-uptodate - nobody can expect
- * anything from a newly allocated buffer anyway. We used to used
- * unmap_buffer() for such invalidation, but that was wrong. We definitely
- * don't want to mark the alias unmapped, for example - it would confuse
- * anyone who might pick it with bread() afterwards...
- *
- * Also.. Note that bforget() doesn't lock the buffer. So there can
- * be writeout I/O going on against recently-freed buffers. We don't
- * wait on that I/O in bforget() - it's more efficient to wait on the I/O
- * only if we really need to. That happens here.
- */
-void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
+/**
+ * clean_bdev_aliases: clean a range of buffers in block device
+ * @bdev: Block device to clean buffers in
+ * @block: Start of a range of blocks to clean
+ * @len: Number of blocks to clean
+ *
+ * We are taking a range of blocks for data and we don't want writeback of any
+ * buffer-cache aliases starting from return from this function and until the
+ * moment when something will explicitly mark the buffer dirty (hopefully that
+ * will not happen until we will free that block ;-) We don't even need to mark
+ * it not-uptodate - nobody can expect anything from a newly allocated buffer
+ * anyway. We used to use unmap_buffer() for such invalidation, but that was
+ * wrong. We definitely don't want to mark the alias unmapped, for example - it
+ * would confuse anyone who might pick it with bread() afterwards...
+ *
+ * Also.. Note that bforget() doesn't lock the buffer. So there can be
+ * writeout I/O going on against recently-freed buffers. We don't wait on that
+ * I/O in bforget() - it's more efficient to wait on the I/O only if we really
+ * need to. That happens here.
+ */
+void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
- struct buffer_head *old_bh;
+ struct inode *bd_inode = bdev->bd_inode;
+ struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct pagevec pvec;
+ pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pgoff_t end;
+ int i;
+ struct buffer_head *bh;
+ struct buffer_head *head;
- might_sleep();
+ end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pagevec_init(&pvec, 0);
+ while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- old_bh = __find_get_block_slow(bdev, block);
- if (old_bh) {
- clear_buffer_dirty(old_bh);
- wait_on_buffer(old_bh);
- clear_buffer_req(old_bh);
- __brelse(old_bh);
+ index = page->index;
+ if (index > end)
+ break;
+ if (!page_has_buffers(page))
+ continue;
+ /*
+ * We use page lock instead of bd_mapping->private_lock
+ * to pin buffers here since we can afford to sleep and
+ * it scales better than a global spinlock.
+ */
+ lock_page(page);
+ /* Recheck when the page is locked which pins bhs */
+ if (!page_has_buffers(page))
+ goto unlock_page;
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (!buffer_mapped(bh) || (bh->b_blocknr < block))
+ goto next;
+ if (bh->b_blocknr >= block + len)
+ break;
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ clear_buffer_req(bh);
+next:
+ bh = bh->b_this_page;
+ } while (bh != head);
+unlock_page:
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
}
}
-EXPORT_SYMBOL(unmap_underlying_metadata);
+EXPORT_SYMBOL(clean_bdev_aliases);
/*
* Size is a power-of-two in the range 512..PAGE_SIZE,
@@ -1684,7 +1728,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
* prevents this contention from occurring.
*
* If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
int __block_write_full_page(struct inode *inode, struct page *page,
@@ -1697,7 +1741,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = wbc_to_write_flags(wbc);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
}
bh = bh->b_this_page;
@@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
}
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping,
if (!buffer_mapped(bh))
is_mapped_to_disk = 0;
if (buffer_new(bh))
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
set_buffer_uptodate(bh);
continue;
@@ -3118,7 +3160,7 @@ EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
* @op: whether to %READ or %WRITE
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
@@ -3210,7 +3252,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer);
int sync_dirty_buffer(struct buffer_head *bh)
{
- return __sync_dirty_buffer(bh, WRITE_SYNC);
+ return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
@@ -3403,7 +3445,7 @@ void free_buffer_head(struct buffer_head *bh)
}
EXPORT_SYMBOL(free_buffer_head);
-static void buffer_exit_cpu(int cpu)
+static int buffer_exit_cpu_dead(unsigned int cpu)
{
int i;
struct bh_lru *b = &per_cpu(bh_lrus, cpu);
@@ -3414,14 +3456,7 @@ static void buffer_exit_cpu(int cpu)
}
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
-}
-
-static int buffer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
- buffer_exit_cpu((unsigned long)hcpu);
- return NOTIFY_OK;
+ return 0;
}
/**
@@ -3471,6 +3506,7 @@ EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
unsigned long nrpages;
+ int ret;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
@@ -3483,5 +3519,7 @@ void __init buffer_init(void)
*/
nrpages = (nr_free_buffer_pages() * 10) / 100;
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
- hotcpu_notifier(buffer_cpu_notify, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
+ NULL, buffer_exit_cpu_dead);
+ WARN_ON(ret < 0);
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ef3ebd780aff..e4b066cd912a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -315,7 +315,32 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
struct page **pages;
pgoff_t next_index;
int nr_pages = 0;
- int ret;
+ int got = 0;
+ int ret = 0;
+
+ if (!current->journal_info) {
+ /* caller of readpages does not hold buffer and read caps
+ * (fadvise, madvise and readahead cases) */
+ int want = CEPH_CAP_FILE_CACHE;
+ ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ if (ret < 0) {
+ dout("start_read %p, error getting cap\n", inode);
+ } else if (!(got & want)) {
+ dout("start_read %p, no cache cap\n", inode);
+ ret = 0;
+ }
+ if (ret <= 0) {
+ if (got)
+ ceph_put_cap_refs(ci, got);
+ while (!list_empty(page_list)) {
+ page = list_entry(page_list->prev,
+ struct page, lru);
+ list_del(&page->lru);
+ put_page(page);
+ }
+ return ret;
+ }
+ }
off = (u64) page_offset(page);
@@ -338,15 +363,18 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
CEPH_OSD_FLAG_READ, NULL,
ci->i_truncate_seq, ci->i_truncate_size,
false);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
/* build page vector */
nr_pages = calc_pages_for(0, len);
pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
- ret = -ENOMEM;
- if (!pages)
- goto out;
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
for (i = 0; i < nr_pages; ++i) {
page = list_entry(page_list->prev, struct page, lru);
BUG_ON(PageLocked(page));
@@ -378,6 +406,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
if (ret < 0)
goto out_pages;
ceph_osdc_put_request(req);
+
+ /* After adding locked pages to page cache, the inode holds cache cap.
+ * So we can drop our cap refs. */
+ if (got)
+ ceph_put_cap_refs(ci, got);
+
return nr_pages;
out_pages:
@@ -386,8 +420,11 @@ out_pages:
unlock_page(pages[i]);
}
ceph_put_page_vector(pages, nr_pages, false);
-out:
+out_put:
ceph_osdc_put_request(req);
+out:
+ if (got)
+ ceph_put_cap_refs(ci, got);
return ret;
}
@@ -424,7 +461,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
rc = start_read(inode, page_list, max);
if (rc < 0)
goto out;
- BUG_ON(rc == 0);
}
out:
ceph_fscache_readpages_cancel(inode, page_list);
@@ -438,7 +474,9 @@ out:
* only snap context we are allowed to write back.
*/
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- loff_t *snap_size)
+ loff_t *snap_size,
+ u64 *truncate_size,
+ u32 *truncate_seq)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc = NULL;
@@ -452,6 +490,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
snapc = ceph_get_snap_context(capsnap->context);
if (snap_size)
*snap_size = capsnap->size;
+ if (truncate_size)
+ *truncate_size = capsnap->truncate_size;
+ if (truncate_seq)
+ *truncate_seq = capsnap->truncate_seq;
break;
}
}
@@ -459,6 +501,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
snapc = ceph_get_snap_context(ci->i_head_snapc);
dout(" head snapc %p has %d dirty pages\n",
snapc, ci->i_wrbuffer_ref_head);
+ if (truncate_size)
+ *truncate_size = ci->i_truncate_size;
+ if (truncate_seq)
+ *truncate_seq = ci->i_truncate_seq;
}
spin_unlock(&ci->i_ceph_lock);
return snapc;
@@ -501,7 +547,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p not dirty?\n", inode, page);
goto out;
}
- oldest = get_oldest_context(inode, &snap_size);
+ oldest = get_oldest_context(inode, &snap_size,
+ &truncate_size, &truncate_seq);
if (snapc->seq > oldest->seq) {
dout("writepage %p page %p snapc %p not writeable - noop\n",
inode, page, snapc);
@@ -512,12 +559,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
- spin_lock(&ci->i_ceph_lock);
- truncate_seq = ci->i_truncate_seq;
- truncate_size = ci->i_truncate_size;
if (snap_size == -1)
snap_size = i_size_read(inode);
- spin_unlock(&ci->i_ceph_lock);
/* is this a partial page at end of file? */
if (page_off >= snap_size) {
@@ -764,7 +807,8 @@ retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
snap_size = -1;
- snapc = get_oldest_context(inode, &snap_size);
+ snapc = get_oldest_context(inode, &snap_size,
+ &truncate_size, &truncate_seq);
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
@@ -774,11 +818,7 @@ retry:
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
- spin_lock(&ci->i_ceph_lock);
- truncate_seq = ci->i_truncate_seq;
- truncate_size = ci->i_truncate_size;
i_size = i_size_read(inode);
- spin_unlock(&ci->i_ceph_lock);
if (last_snapc && snapc != last_snapc) {
/* if we switched to a newer snapc, restart our scan at the
@@ -1124,7 +1164,8 @@ out:
static int context_is_writeable_or_written(struct inode *inode,
struct ceph_snap_context *snapc)
{
- struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL,
+ NULL, NULL);
int ret = !oldest || snapc->seq <= oldest->seq;
ceph_put_snap_context(oldest);
@@ -1169,7 +1210,7 @@ retry_locked:
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
- oldest = get_oldest_context(inode, NULL);
+ oldest = get_oldest_context(inode, NULL, NULL, NULL);
if (snapc->seq > oldest->seq) {
ceph_put_snap_context(oldest);
@@ -1276,25 +1317,27 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- unsigned from = pos & (PAGE_SIZE - 1);
int check_cap = 0;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
/* zero the stale part of the page if we did a short copy */
- if (copied < len)
- zero_user_segment(page, from+copied, len);
+ if (!PageUptodate(page)) {
+ if (copied < len) {
+ copied = 0;
+ goto out;
+ }
+ SetPageUptodate(page);
+ }
/* did file size increase? */
if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
set_page_dirty(page);
+out:
unlock_page(page);
put_page(page);
@@ -1371,9 +1414,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
- ci->i_inline_version == CEPH_INLINE_NONE)
+ ci->i_inline_version == CEPH_INLINE_NONE) {
+ current->journal_info = vma->vm_file;
ret = filemap_fault(vma, vmf);
- else
+ current->journal_info = NULL;
+ } else
ret = -EAGAIN;
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
@@ -1905,6 +1950,15 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
struct ceph_string *pool_ns;
int ret, flags;
+ if (ci->i_vino.snap != CEPH_NOSNAP) {
+ /*
+ * Pool permission check needs to write to the first object.
+ * But for snapshot, head of the first object may have already
+ * been deleted. Skip check to avoid creating orphan object.
+ */
+ return 0;
+ }
+
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 16e6ded0b7f2..94fd76d04683 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -987,96 +987,127 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
__cap_delay_cancel(mdsc, ci);
}
+struct cap_msg_args {
+ struct ceph_mds_session *session;
+ u64 ino, cid, follows;
+ u64 flush_tid, oldest_flush_tid, size, max_size;
+ u64 xattr_version;
+ struct ceph_buffer *xattr_buf;
+ struct timespec atime, mtime, ctime;
+ int op, caps, wanted, dirty;
+ u32 seq, issue_seq, mseq, time_warp_seq;
+ u32 flags;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ bool inline_data;
+};
+
/*
* Build and send a cap message to the given MDS.
*
* Caller should be holding s_mutex.
*/
-static int send_cap_msg(struct ceph_mds_session *session,
- u64 ino, u64 cid, int op,
- int caps, int wanted, int dirty,
- u32 seq, u64 flush_tid, u64 oldest_flush_tid,
- u32 issue_seq, u32 mseq, u64 size, u64 max_size,
- struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u32 time_warp_seq,
- kuid_t uid, kgid_t gid, umode_t mode,
- u64 xattr_version,
- struct ceph_buffer *xattrs_buf,
- u64 follows, bool inline_data)
+static int send_cap_msg(struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
struct ceph_msg *msg;
void *p;
size_t extra_len;
+ struct timespec zerotime = {0};
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
- cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
- ceph_cap_string(dirty),
- seq, issue_seq, flush_tid, oldest_flush_tid,
- mseq, follows, size, max_size,
- xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
+ arg->cid, arg->ino, ceph_cap_string(arg->caps),
+ ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
+ arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
+ arg->mseq, arg->follows, arg->size, arg->max_size,
+ arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
/* flock buffer size + inline version + inline data size +
* osd_epoch_barrier + oldest_flush_tid */
- extra_len = 4 + 8 + 4 + 4 + 8;
+ extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
GFP_NOFS, false);
if (!msg)
return -ENOMEM;
- msg->hdr.version = cpu_to_le16(6);
- msg->hdr.tid = cpu_to_le64(flush_tid);
+ msg->hdr.version = cpu_to_le16(10);
+ msg->hdr.tid = cpu_to_le64(arg->flush_tid);
fc = msg->front.iov_base;
memset(fc, 0, sizeof(*fc));
- fc->cap_id = cpu_to_le64(cid);
- fc->op = cpu_to_le32(op);
- fc->seq = cpu_to_le32(seq);
- fc->issue_seq = cpu_to_le32(issue_seq);
- fc->migrate_seq = cpu_to_le32(mseq);
- fc->caps = cpu_to_le32(caps);
- fc->wanted = cpu_to_le32(wanted);
- fc->dirty = cpu_to_le32(dirty);
- fc->ino = cpu_to_le64(ino);
- fc->snap_follows = cpu_to_le64(follows);
-
- fc->size = cpu_to_le64(size);
- fc->max_size = cpu_to_le64(max_size);
- if (mtime)
- ceph_encode_timespec(&fc->mtime, mtime);
- if (atime)
- ceph_encode_timespec(&fc->atime, atime);
- if (ctime)
- ceph_encode_timespec(&fc->ctime, ctime);
- fc->time_warp_seq = cpu_to_le32(time_warp_seq);
-
- fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
- fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
- fc->mode = cpu_to_le32(mode);
+ fc->cap_id = cpu_to_le64(arg->cid);
+ fc->op = cpu_to_le32(arg->op);
+ fc->seq = cpu_to_le32(arg->seq);
+ fc->issue_seq = cpu_to_le32(arg->issue_seq);
+ fc->migrate_seq = cpu_to_le32(arg->mseq);
+ fc->caps = cpu_to_le32(arg->caps);
+ fc->wanted = cpu_to_le32(arg->wanted);
+ fc->dirty = cpu_to_le32(arg->dirty);
+ fc->ino = cpu_to_le64(arg->ino);
+ fc->snap_follows = cpu_to_le64(arg->follows);
+
+ fc->size = cpu_to_le64(arg->size);
+ fc->max_size = cpu_to_le64(arg->max_size);
+ ceph_encode_timespec(&fc->mtime, &arg->mtime);
+ ceph_encode_timespec(&fc->atime, &arg->atime);
+ ceph_encode_timespec(&fc->ctime, &arg->ctime);
+ fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
+
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
+ fc->mode = cpu_to_le32(arg->mode);
+
+ fc->xattr_version = cpu_to_le64(arg->xattr_version);
+ if (arg->xattr_buf) {
+ msg->middle = ceph_buffer_get(arg->xattr_buf);
+ fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
+ }
p = fc + 1;
- /* flock buffer size */
+ /* flock buffer size (version 2) */
ceph_encode_32(&p, 0);
- /* inline version */
- ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
+ /* inline version (version 4) */
+ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
- /* osd_epoch_barrier */
+ /* osd_epoch_barrier (version 5) */
ceph_encode_32(&p, 0);
- /* oldest_flush_tid */
- ceph_encode_64(&p, oldest_flush_tid);
+ /* oldest_flush_tid (version 6) */
+ ceph_encode_64(&p, arg->oldest_flush_tid);
- fc->xattr_version = cpu_to_le64(xattr_version);
- if (xattrs_buf) {
- msg->middle = ceph_buffer_get(xattrs_buf);
- fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- }
+ /*
+ * caller_uid/caller_gid (version 7)
+ *
+ * Currently, we don't properly track which caller dirtied the caps
+ * last, and force a flush of them when there is a conflict. For now,
+ * just set this to 0:0, to emulate how the MDS has worked up to now.
+ */
+ ceph_encode_32(&p, 0);
+ ceph_encode_32(&p, 0);
+
+ /* pool namespace (version 8) (mds always ignores this) */
+ ceph_encode_32(&p, 0);
- ceph_con_send(&session->s_con, msg);
+ /*
+ * btime and change_attr (version 9)
+ *
+ * We just zero these out for now, as the MDS ignores them unless
+ * the requisite feature flags are set (which we don't do yet).
+ */
+ ceph_encode_timespec(p, &zerotime);
+ p += sizeof(struct ceph_timespec);
+ ceph_encode_64(&p, 0);
+
+ /* Advisory flags (version 10) */
+ ceph_encode_32(&p, arg->flags);
+
+ ceph_con_send(&arg->session->s_con, msg);
return 0;
}
@@ -1115,27 +1146,17 @@ void ceph_queue_caps_release(struct inode *inode)
* caller should hold snap_rwsem (read), s_mutex.
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, int used, int want, int retain, int flushing,
- u64 flush_tid, u64 oldest_flush_tid)
+ int op, bool sync, int used, int want, int retain,
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
__releases(cap->ci->i_ceph_lock)
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
- u64 cap_id = cap->cap_id;
- int held, revoking, dropping, keep;
- u64 follows, size, max_size;
- u32 seq, issue_seq, mseq, time_warp_seq;
- struct timespec mtime, atime, ctime;
+ struct cap_msg_args arg;
+ int held, revoking, dropping;
int wake = 0;
- umode_t mode;
- kuid_t uid;
- kgid_t gid;
- struct ceph_mds_session *session;
- u64 xattr_version = 0;
- struct ceph_buffer *xattr_blob = NULL;
int delayed = 0;
int ret;
- bool inline_data;
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
@@ -1148,7 +1169,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- session = cap->session;
+ arg.session = cap->session;
/* don't release wanted unless we've waited a bit. */
if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
@@ -1177,40 +1198,51 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
- follows = flushing ? ci->i_head_snapc->seq : 0;
-
- keep = cap->implemented;
- seq = cap->seq;
- issue_seq = cap->issue_seq;
- mseq = cap->mseq;
- size = inode->i_size;
- ci->i_reported_size = size;
- max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = max_size;
- mtime = inode->i_mtime;
- atime = inode->i_atime;
- ctime = inode->i_ctime;
- time_warp_seq = ci->i_time_warp_seq;
- uid = inode->i_uid;
- gid = inode->i_gid;
- mode = inode->i_mode;
+ arg.ino = ceph_vino(inode).ino;
+ arg.cid = cap->cap_id;
+ arg.follows = flushing ? ci->i_head_snapc->seq : 0;
+ arg.flush_tid = flush_tid;
+ arg.oldest_flush_tid = oldest_flush_tid;
+
+ arg.size = inode->i_size;
+ ci->i_reported_size = arg.size;
+ arg.max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
__ceph_build_xattrs_blob(ci);
- xattr_blob = ci->i_xattrs.blob;
- xattr_version = ci->i_xattrs.version;
+ arg.xattr_version = ci->i_xattrs.version;
+ arg.xattr_buf = ci->i_xattrs.blob;
+ } else {
+ arg.xattr_buf = NULL;
}
- inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+ arg.mtime = inode->i_mtime;
+ arg.atime = inode->i_atime;
+ arg.ctime = inode->i_ctime;
+
+ arg.op = op;
+ arg.caps = cap->implemented;
+ arg.wanted = want;
+ arg.dirty = flushing;
+
+ arg.seq = cap->seq;
+ arg.issue_seq = cap->issue_seq;
+ arg.mseq = cap->mseq;
+ arg.time_warp_seq = ci->i_time_warp_seq;
+
+ arg.uid = inode->i_uid;
+ arg.gid = inode->i_gid;
+ arg.mode = inode->i_mode;
+
+ arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+ arg.flags = 0;
+ if (sync)
+ arg.flags |= CEPH_CLIENT_CAPS_SYNC;
spin_unlock(&ci->i_ceph_lock);
- ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq,
- flush_tid, oldest_flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, &ctime, time_warp_seq,
- uid, gid, mode, xattr_version, xattr_blob,
- follows, inline_data);
+ ret = send_cap_msg(&arg);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
delayed = 1;
@@ -1227,15 +1259,42 @@ static inline int __send_flush_snap(struct inode *inode,
struct ceph_cap_snap *capsnap,
u32 mseq, u64 oldest_flush_tid)
{
- return send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->cap_flush.tid,
- oldest_flush_tid, 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
+ struct cap_msg_args arg;
+
+ arg.session = session;
+ arg.ino = ceph_vino(inode).ino;
+ arg.cid = 0;
+ arg.follows = capsnap->follows;
+ arg.flush_tid = capsnap->cap_flush.tid;
+ arg.oldest_flush_tid = oldest_flush_tid;
+
+ arg.size = capsnap->size;
+ arg.max_size = 0;
+ arg.xattr_version = capsnap->xattr_version;
+ arg.xattr_buf = capsnap->xattr_blob;
+
+ arg.atime = capsnap->atime;
+ arg.mtime = capsnap->mtime;
+ arg.ctime = capsnap->ctime;
+
+ arg.op = CEPH_CAP_OP_FLUSHSNAP;
+ arg.caps = capsnap->issued;
+ arg.wanted = 0;
+ arg.dirty = capsnap->dirty;
+
+ arg.seq = 0;
+ arg.issue_seq = 0;
+ arg.mseq = mseq;
+ arg.time_warp_seq = capsnap->time_warp_seq;
+
+ arg.uid = capsnap->uid;
+ arg.gid = capsnap->gid;
+ arg.mode = capsnap->mode;
+
+ arg.inline_data = capsnap->inline_data;
+ arg.flags = 0;
+
+ return send_cap_msg(&arg);
}
/*
@@ -1858,9 +1917,9 @@ ack:
sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
- want, retain, flushing,
- flush_tid, oldest_flush_tid);
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
+ cap_used, want, retain, flushing,
+ flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -1924,9 +1983,9 @@ retry:
&flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- (cap->issued | cap->implemented),
- flushing, flush_tid, oldest_flush_tid);
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
+ used, want, (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
@@ -1996,7 +2055,7 @@ static int unsafe_request_wait(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+ dout("unsafe_request_wait %p wait on tid %llu %llu\n",
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@@ -2119,7 +2178,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
inode, cap, cf->tid, ceph_cap_string(cf->caps));
ci->i_ceph_flags |= CEPH_I_NODELAY;
ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
+ false, __ceph_caps_used(ci),
__ceph_caps_wanted(ci),
cap->issued | cap->implemented,
cf->caps, cf->tid, oldest_flush_tid);
@@ -2479,6 +2538,27 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+{
+ int ret, err = 0;
+
+ BUG_ON(need & ~CEPH_CAP_FILE_RD);
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+ ret = ceph_pool_perm_check(ci, need);
+ if (ret < 0)
+ return ret;
+
+ ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ if (ret) {
+ if (err == -EAGAIN) {
+ ret = 0;
+ } else if (err < 0) {
+ ret = err;
+ }
+ }
+ return ret;
+}
+
/*
* Wait for caps, and take cap references. If we can't get a WR cap
* due to a small max_size, make sure we check_max_size (and possibly
@@ -2507,9 +2587,20 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
if (err < 0)
ret = err;
} else {
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- true, &_got, &err));
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ add_wait_queue(&ci->i_cap_wq, &wait);
+
+ while (!try_get_cap_refs(ci, need, want, endoff,
+ true, &_got, &err)) {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+
+ remove_wait_queue(&ci->i_cap_wq, &wait);
+
if (err == -EAGAIN)
continue;
if (err < 0)
@@ -3570,6 +3661,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap->cap_id = le64_to_cpu(h->cap_id);
cap->mseq = mseq;
cap->seq = seq;
+ cap->issue_seq = seq;
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps,
&session->s_cap_releases);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a594c7879cc2..8ab1fdf0bd49 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -32,40 +32,19 @@ const struct dentry_operations ceph_dentry_ops;
/*
* Initialize ceph dentry state.
*/
-int ceph_init_dentry(struct dentry *dentry)
+static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
- if (dentry->d_fsdata)
- return 0;
-
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
return -ENOMEM; /* oh well */
- spin_lock(&dentry->d_lock);
- if (dentry->d_fsdata) {
- /* lost a race */
- kmem_cache_free(ceph_dentry_cachep, di);
- goto out_unlock;
- }
-
- if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP)
- d_set_d_op(dentry, &ceph_dentry_ops);
- else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
- d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
- else
- d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
di->dentry = dentry;
di->lease_session = NULL;
di->time = jiffies;
- /* avoid reordering d_fsdata setup so that the check above is safe */
- smp_mb();
dentry->d_fsdata = di;
ceph_dentry_lru_add(dentry);
-out_unlock:
- spin_unlock(&dentry->d_lock);
return 0;
}
@@ -737,10 +716,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- err = ceph_init_dentry(dentry);
- if (err < 0)
- return ERR_PTR(err);
-
/* can we conclude ENOENT locally? */
if (d_really_is_negative(dentry)) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -1255,7 +1230,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct ceph_mds_client *mdsc =
ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
- int op, mask, err;
+ int op, err;
+ u32 mask;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -1270,7 +1246,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
- req->r_args.getattr.mask = mask;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
err = ceph_mdsc_do_request(mdsc, NULL, req);
switch (err) {
@@ -1323,16 +1299,6 @@ static void ceph_d_release(struct dentry *dentry)
kmem_cache_free(ceph_dentry_cachep, di);
}
-static int ceph_snapdir_d_revalidate(struct dentry *dentry,
- unsigned int flags)
-{
- /*
- * Eventually, we'll want to revalidate snapped metadata
- * too... probably...
- */
- return 1;
-}
-
/*
* When the VFS prunes a dentry from the cache, we need to clear the
* complete flag on the parent directory.
@@ -1351,6 +1317,9 @@ static void ceph_d_prune(struct dentry *dentry)
if (d_unhashed(dentry))
return;
+ if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
+ return;
+
/*
* we hold d_lock, so d_parent is stable, and d_fsdata is never
* cleared until d_release
@@ -1521,14 +1490,5 @@ const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
-};
-
-const struct dentry_operations ceph_snapdir_dentry_ops = {
- .d_revalidate = ceph_snapdir_d_revalidate,
- .d_release = ceph_d_release,
-};
-
-const struct dentry_operations ceph_snap_dentry_ops = {
- .d_release = ceph_d_release,
- .d_prune = ceph_d_prune,
+ .d_init = ceph_d_init,
};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 1780218a48f0..180bbef760f2 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -62,7 +62,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
- struct dentry *dentry;
struct ceph_vino vino;
int err;
@@ -94,16 +93,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
return ERR_PTR(-ESTALE);
}
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry))
- return dentry;
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- dput(dentry);
- return ERR_PTR(err);
- }
- dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
- return dentry;
+ return d_obtain_alias(inode);
}
/*
@@ -131,7 +121,6 @@ static struct dentry *__get_parent(struct super_block *sb,
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct ceph_mds_request *req;
struct inode *inode;
- struct dentry *dentry;
int mask;
int err;
@@ -164,18 +153,7 @@ static struct dentry *__get_parent(struct super_block *sb,
if (!inode)
return ERR_PTR(-ENOENT);
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry))
- return dentry;
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- dput(dentry);
- return ERR_PTR(err);
- }
- dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
- child ? ceph_ino(d_inode(child)) : ino,
- dentry, ceph_vinop(inode));
- return dentry;
+ return d_obtain_alias(inode);
}
static struct dentry *ceph_get_parent(struct dentry *child)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f995e3528a33..045d30d26624 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -351,10 +351,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
- err = ceph_init_dentry(dentry);
- if (err < 0)
- return err;
-
if (flags & O_CREAT) {
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
@@ -458,71 +454,60 @@ enum {
* only return a short read to the caller if we hit EOF.
*/
static int striped_read(struct inode *inode,
- u64 off, u64 len,
+ u64 pos, u64 len,
struct page **pages, int num_pages,
- int *checkeof)
+ int page_align, int *checkeof)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- u64 pos, this_len, left;
+ u64 this_len;
loff_t i_size;
- int page_align, pages_left;
- int read, ret;
- struct page **page_pos;
+ int page_idx;
+ int ret, read = 0;
bool hit_stripe, was_short;
/*
* we may need to do multiple reads. not atomic, unfortunately.
*/
- pos = off;
- left = len;
- page_pos = pages;
- pages_left = num_pages;
- read = 0;
-
more:
- page_align = pos & ~PAGE_MASK;
- this_len = left;
+ this_len = len;
+ page_idx = (page_align + read) >> PAGE_SHIFT;
ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- page_pos, pages_left, page_align);
+ ci->i_truncate_seq, ci->i_truncate_size,
+ pages + page_idx, num_pages - page_idx,
+ ((page_align + read) & ~PAGE_MASK));
if (ret == -ENOENT)
ret = 0;
- hit_stripe = this_len < left;
+ hit_stripe = this_len < len;
was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
+ dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
i_size = i_size_read(inode);
if (ret >= 0) {
- int didpages;
if (was_short && (pos + ret < i_size)) {
int zlen = min(this_len - ret, i_size - pos - ret);
- int zoff = (off & ~PAGE_MASK) + read + ret;
+ int zoff = page_align + read + ret;
dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + zlen);
+ pos + ret, pos + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
- didpages = (page_align + ret) >> PAGE_SHIFT;
+ read += ret;
pos += ret;
- read = pos - off;
- left -= ret;
- page_pos += didpages;
- pages_left -= didpages;
+ len -= ret;
/* hit stripe and need continue*/
- if (left && hit_stripe && pos < i_size)
+ if (len && hit_stripe && pos < i_size)
goto more;
}
if (read > 0) {
ret = read;
/* did we bounce off eof? */
- if (pos + left > i_size)
+ if (pos + len > i_size)
*checkeof = CHECK_EOF;
}
@@ -536,15 +521,16 @@ more:
*
* If the read spans object boundary, just do multiple reads.
*/
-static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
- int *checkeof)
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
+ int *checkeof)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct page **pages;
u64 off = iocb->ki_pos;
- int num_pages, ret;
- size_t len = iov_iter_count(i);
+ int num_pages;
+ ssize_t ret;
+ size_t len = iov_iter_count(to);
dout("sync_read on file %p %llu~%u %s\n", file, off,
(unsigned)len,
@@ -563,35 +549,56 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
if (ret < 0)
return ret;
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
- ret = striped_read(inode, off, len, pages,
- num_pages, checkeof);
- if (ret > 0) {
- int l, k = 0;
- size_t left = ret;
-
- while (left) {
- size_t page_off = off & ~PAGE_MASK;
- size_t copy = min_t(size_t, left,
- PAGE_SIZE - page_off);
- l = copy_page_to_iter(pages[k++], page_off, copy, i);
- off += l;
- left -= l;
- if (l < copy)
- break;
+ if (unlikely(to->type & ITER_PIPE)) {
+ size_t page_off;
+ ret = iov_iter_get_pages_alloc(to, &pages, len,
+ &page_off);
+ if (ret <= 0)
+ return -ENOMEM;
+ num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+
+ ret = striped_read(inode, off, ret, pages, num_pages,
+ page_off, checkeof);
+ if (ret > 0) {
+ iov_iter_advance(to, ret);
+ off += ret;
+ } else {
+ iov_iter_advance(to, 0);
}
+ ceph_put_page_vector(pages, num_pages, false);
+ } else {
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = striped_read(inode, off, len, pages, num_pages,
+ (off & ~PAGE_MASK), checkeof);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ size_t page_off = off & ~PAGE_MASK;
+ size_t copy = min_t(size_t, left,
+ PAGE_SIZE - page_off);
+ l = copy_page_to_iter(pages[k++], page_off,
+ copy, to);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
+ ceph_release_page_vector(pages, num_pages);
}
- ceph_release_page_vector(pages, num_pages);
if (off > iocb->ki_pos) {
ret = off - iocb->ki_pos;
iocb->ki_pos = off;
}
- dout("sync_read result %d\n", ret);
+ dout("sync_read result %zd\n", ret);
return ret;
}
@@ -853,7 +860,7 @@ void ceph_sync_write_wait(struct inode *inode)
dout("sync_write_wait on tid %llu (until %llu)\n",
req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
+ wait_for_completion(&req->r_done_completion);
ceph_osdc_put_request(req);
spin_lock(&ci->i_unsafe_lock);
@@ -906,7 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
pos >> PAGE_SHIFT,
(pos + count) >> PAGE_SHIFT);
if (ret2 < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret);
+ dout("invalidate_inode_pages2_range returned %d\n", ret2);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
@@ -1249,8 +1256,9 @@ again:
dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
-
+ current->journal_info = filp;
ret = generic_file_read_iter(iocb, to);
+ current->journal_info = NULL;
}
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
@@ -1770,6 +1778,7 @@ const struct file_operations ceph_file_fops = {
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
+ .splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ef4d04647325..5e659d054b40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -305,7 +305,8 @@ static int frag_tree_split_cmp(const void *l, const void *r)
{
struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
- return ceph_frag_compare(ls->frag, rs->frag);
+ return ceph_frag_compare(le32_to_cpu(ls->frag),
+ le32_to_cpu(rs->frag));
}
static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
@@ -1023,16 +1024,17 @@ static void update_dentry_lease(struct dentry *dentry,
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
struct inode *dir;
- /* only track leases on regular dentries */
- if (dentry->d_op != &ceph_dentry_ops)
- return;
-
spin_lock(&dentry->d_lock);
dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
dentry, duration, ttl);
/* make lease_rdcache_gen match directory */
dir = d_inode(dentry->d_parent);
+
+ /* only track leases on regular dentries */
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ goto out_unlock;
+
di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
if (duration == 0)
@@ -1202,12 +1204,7 @@ retry_lookup:
err = -ENOMEM;
goto done;
}
- err = ceph_init_dentry(dn);
- if (err < 0) {
- dput(dn);
- dput(parent);
- goto done;
- }
+ err = 0;
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != vino.ino ||
ceph_snap(d_inode(dn)) != vino.snap)) {
@@ -1561,12 +1558,6 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
- ret = ceph_init_dentry(dn);
- if (ret < 0) {
- dput(dn);
- err = ret;
- goto out;
- }
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != vino.ino ||
ceph_snap(d_inode(dn)) != vino.snap)) {
@@ -1879,7 +1870,6 @@ retry:
* symlinks
*/
static const struct inode_operations ceph_symlink_iops = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 815acd1a56d4..c9d2e553a6c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -288,12 +288,13 @@ static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
- if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+ u32 op = le32_to_cpu(info->head->op);
+
+ if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else if (info->head->op == CEPH_MDS_OP_READDIR ||
- info->head->op == CEPH_MDS_OP_LSSNAP)
+ else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_dir(p, end, info, features);
- else if (info->head->op == CEPH_MDS_OP_CREATE)
+ else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
return -EIO;
@@ -2100,17 +2101,31 @@ static int __do_request(struct ceph_mds_client *mdsc,
err = -EIO;
goto finish;
}
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
+ if (mdsc->mdsmap->m_epoch == 0) {
+ dout("do_request no mdsmap, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ goto finish;
+ }
+ if (!(mdsc->fsc->mount_options->flags &
+ CEPH_MOUNT_OPT_MOUNTWAIT) &&
+ !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
+ err = -ENOENT;
+ pr_info("probably no mds server is up\n");
+ goto finish;
+ }
+ }
put_request_session(req);
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
- if (mdsc->mdsmap_err) {
- err = mdsc->mdsmap_err;
- dout("do_request mdsmap err %d\n", err);
- goto finish;
- }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
goto out;
@@ -3943,13 +3958,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
}
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
+static int verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
- return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
}
static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 8c3591a7fbae..5454e2327a5f 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -42,6 +42,60 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return i;
}
+/*
+ * Skip one fixed-size field of @type at *@p without decoding it.
+ * Jumps to label @bad when adding sizeof(@type) to *@p would go past @end;
+ * otherwise advances *@p by sizeof(@type).
+ */
+#define __decode_and_drop_type(p, end, type, bad) \
+ do { \
+ if (*p + sizeof(type) > end) \
+ goto bad; \
+ *p += sizeof(type); \
+ } while (0)
+
+/*
+ * Skip an encoded set: a u32 element count followed by that many fixed-size
+ * elements of @type.  Jumps to label @bad when the buffer ending at @end is
+ * too short for the count or for the elements it announces.
+ *
+ * The count comes off the wire, so it is bounded by the bytes actually left
+ * before it is multiplied; otherwise sizeof(type) * n could wrap size_t on
+ * 32-bit builds and a bogus count would pass the length check.
+ * NOTE(review): assumes ceph_decode_32_safe() leaves *p at or before end on
+ * success, so the remaining-bytes difference is non-negative.
+ */
+#define __decode_and_drop_set(p, end, type, bad) \
+	do { \
+		u32 n; \
+		size_t need; \
+		ceph_decode_32_safe(p, end, n, bad); \
+		if (n > (size_t)((char *)(end) - (char *)*(p)) / sizeof(type)) \
+			goto bad; \
+		need = sizeof(type) * n; \
+		ceph_decode_need(p, end, need, bad); \
+		*p += need; \
+	} while (0)
+
+/*
+ * Skip an encoded map: a u32 entry count followed by that many fixed-size
+ * (@ktype, @vtype) pairs.  Jumps to label @bad when the buffer ending at
+ * @end is too short for the count or for the entries it announces.
+ *
+ * The count comes off the wire, so it is bounded by the bytes actually left
+ * before it is multiplied; otherwise the entry-size product could wrap
+ * size_t on 32-bit builds and a bogus count would pass the length check.
+ * NOTE(review): assumes ceph_decode_32_safe() leaves *p at or before end on
+ * success, so the remaining-bytes difference is non-negative.
+ */
+#define __decode_and_drop_map(p, end, ktype, vtype, bad) \
+	do { \
+		u32 n; \
+		size_t need; \
+		ceph_decode_32_safe(p, end, n, bad); \
+		if (n > (size_t)((char *)(end) - (char *)*(p)) / \
+				(sizeof(ktype) + sizeof(vtype))) \
+			goto bad; \
+		need = (sizeof(ktype) + sizeof(vtype)) * n; \
+		ceph_decode_need(p, end, need, bad); \
+		*p += need; \
+	} while (0)
+
+
+/*
+ * Skip over three consecutive encoded feature sets (compat, ro_compat,
+ * incompat) without keeping any of their contents.
+ *
+ * Each set is a u64 mask followed by a names map: a u32 entry count, then
+ * per entry a u64 key and a string stored as a u32 length plus that many
+ * bytes.  *@p is advanced past everything that was consumed.
+ *
+ * Returns 0 on success, -1 if the data would run past @end.
+ */
+static int __decode_and_drop_compat_set(void **p, void* end)
+{
+	int set;
+
+	for (set = 0; set < 3; set++) {
+		u32 nr_names;
+
+		/* the mask and the map's entry count are checked together */
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		*p += sizeof(u64);
+		nr_names = ceph_decode_32(p);
+
+		for (; nr_names > 0; nr_names--) {
+			u32 name_len;
+
+			/* key and string length first, then the string body */
+			ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
+					 bad);
+			*p += sizeof(u64);
+			name_len = ceph_decode_32(p);
+			ceph_decode_need(p, end, name_len, bad);
+			*p += name_len;
+		}
+	}
+	return 0;
+
+bad:
+	return -1;
+}
+
/*
* Decode an MDS map
*
@@ -55,6 +109,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
int i, j, n;
int err = -EINVAL;
u8 mdsmap_v, mdsmap_cv;
+ u16 mdsmap_ev;
m = kzalloc(sizeof(*m), GFP_NOFS);
if (m == NULL)
@@ -83,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
if (m->m_info == NULL)
- goto badmem;
+ goto nomem;
/* pick out active nodes from mds_info (state > 0) */
n = ceph_decode_32(p);
@@ -166,7 +221,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = kcalloc(num_export_targets,
sizeof(u32), GFP_NOFS);
if (info->export_targets == NULL)
- goto badmem;
+ goto nomem;
for (j = 0; j < num_export_targets; j++)
info->export_targets[j] =
ceph_decode_32(&pexport_targets);
@@ -180,24 +235,104 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_num_data_pg_pools = n;
m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
- goto badmem;
+ goto nomem;
ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
m->m_data_pg_pools[i] = ceph_decode_64(p);
m->m_cas_pg_pool = ceph_decode_64(p);
+ m->m_enabled = m->m_epoch > 1;
+
+ mdsmap_ev = 1;
+ if (mdsmap_v >= 2) {
+ ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
+ }
+ if (mdsmap_ev >= 3) {
+ if (__decode_and_drop_compat_set(p, end) < 0)
+ goto bad_ext;
+ }
+ /* metadata_pool */
+ if (mdsmap_ev < 5) {
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ } else {
+ __decode_and_drop_type(p, end, u64, bad_ext);
+ }
- /* ok, we don't care about the rest. */
+ /* created + modified + tableserver */
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, u32, bad_ext);
+
+ /* in */
+ {
+ int num_laggy = 0;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
+
+ for (i = 0; i < n; i++) {
+ s32 mds = ceph_decode_32(p);
+ if (mds >= 0 && mds < m->m_max_mds) {
+ if (m->m_info[mds].laggy)
+ num_laggy++;
+ }
+ }
+ m->m_num_laggy = num_laggy;
+ }
+
+ /* inc */
+ __decode_and_drop_map(p, end, u32, u32, bad_ext);
+ /* up */
+ __decode_and_drop_map(p, end, u32, u64, bad_ext);
+ /* failed */
+ __decode_and_drop_set(p, end, u32, bad_ext);
+ /* stopped */
+ __decode_and_drop_set(p, end, u32, bad_ext);
+
+ if (mdsmap_ev >= 4) {
+ /* last_failure_osd_epoch */
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ }
+ if (mdsmap_ev >= 6) {
+ /* ever_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ /* explicitly_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 7) {
+ /* inline_data_enabled */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 8) {
+ u32 name_len;
+ /* enabled */
+ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
+ ceph_decode_32_safe(p, end, name_len, bad_ext);
+ ceph_decode_need(p, end, name_len, bad_ext);
+ *p += name_len;
+ }
+ /* damaged */
+ if (mdsmap_ev >= 9) {
+ size_t need;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ need = sizeof(u32) * n;
+ ceph_decode_need(p, end, need, bad_ext);
+ *p += need;
+ m->m_damaged = n > 0;
+ } else {
+ m->m_damaged = false;
+ }
+bad_ext:
*p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
-
-badmem:
+nomem:
err = -ENOMEM;
+ goto out_err;
bad:
pr_err("corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
+out_err:
ceph_mdsmap_destroy(m);
return ERR_PTR(err);
}
@@ -212,3 +347,19 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
kfree(m->m_data_pg_pools);
kfree(m);
}
+
+/*
+ * Decide from the decoded map @m whether the MDS cluster can be used.
+ *
+ * Returns false when the map is not enabled, is marked damaged, or counts
+ * any laggy rank; otherwise returns true if at least one of the m_max_mds
+ * ranks is in CEPH_MDS_STATE_ACTIVE.
+ */
+bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
+{
+	int rank;
+
+	/* any one of these rules the cluster out, whatever the rank states */
+	if (!m->m_enabled || m->m_damaged || m->m_num_laggy > 0)
+		return false;
+
+	/* a single active rank is enough */
+	for (rank = 0; rank < m->m_max_mds; rank++) {
+		if (m->m_info[rank].state == CEPH_MDS_STATE_ACTIVE)
+			return true;
+	}
+	return false;
+}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9ff5219d849e..8f8b41c2ef0f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -593,6 +593,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->atime = inode->i_atime;
capsnap->ctime = inode->i_ctime;
capsnap->time_warp_seq = ci->i_time_warp_seq;
+ capsnap->truncate_size = ci->i_truncate_size;
+ capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
"still has %d dirty pages\n", inode, capsnap,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b382e5910eea..6bd20d707bfd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -137,6 +137,8 @@ enum {
Opt_nofscache,
Opt_poolperm,
Opt_nopoolperm,
+ Opt_require_active_mds,
+ Opt_norequire_active_mds,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
Opt_acl,
#endif
@@ -171,6 +173,8 @@ static match_table_t fsopt_tokens = {
{Opt_nofscache, "nofsc"},
{Opt_poolperm, "poolperm"},
{Opt_nopoolperm, "nopoolperm"},
+ {Opt_require_active_mds, "require_active_mds"},
+ {Opt_norequire_active_mds, "norequire_active_mds"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL
{Opt_acl, "acl"},
#endif
@@ -287,6 +291,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nopoolperm:
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
break;
+ case Opt_require_active_mds:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT;
+ break;
+ case Opt_norequire_active_mds:
+ fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= MS_POSIXACL;
@@ -795,7 +805,6 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
root = ERR_PTR(-ENOMEM);
goto out;
}
- ceph_init_dentry(root);
dout("open_root_inode success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
@@ -879,6 +888,7 @@ static int ceph_set_super(struct super_block *s, void *data)
fsc->sb = s;
s->s_op = &ceph_super_ops;
+ s->s_d_op = &ceph_dentry_ops;
s->s_export_op = &ceph_export_ops;
s->s_time_gran = 1000; /* 1000 ns == 1 us */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3e3fa9163059..3373b61faefd 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,6 +36,7 @@
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
+#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
@@ -180,6 +181,8 @@ struct ceph_cap_snap {
u64 size;
struct timespec mtime, atime, ctime;
u64 time_warp_seq;
+ u64 truncate_size;
+ u32 truncate_seq;
int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */
bool inline_data;
@@ -905,6 +908,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
+extern int ceph_try_get_caps(struct ceph_inode_info *ci,
+ int need, int want, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
@@ -934,8 +939,7 @@ extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
extern const struct inode_operations ceph_dir_iops;
extern const struct inode_operations ceph_snapdir_iops;
-extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
- ceph_snapdir_dentry_ops;
+extern const struct dentry_operations ceph_dentry_ops;
extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -951,13 +955,6 @@ extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
-/*
- * our d_ops vary depending on whether the inode is live,
- * snapshotted (read-only), or a virtual ".snap" directory.
- */
-int ceph_init_dentry(struct dentry *dentry);
-
-
/* ioctl.c */
extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 3d03e48a9213..9727e1dcacd5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -24,7 +24,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5eb04129f938..66bd7fa9b7a6 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -699,11 +699,15 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
- rc = find_domain_name(ses, nls_cp);
- if (rc) {
- cifs_dbg(VFS, "error %d finding domain name\n",
- rc);
- goto setup_ntlmv2_rsp_ret;
+ if (ses->domainAuto) {
+ rc = find_domain_name(ses, nls_cp);
+ if (rc) {
+ cifs_dbg(VFS, "error %d finding domain name\n",
+ rc);
+ goto setup_ntlmv2_rsp_ret;
+ }
+ } else {
+ ses->domainName = kstrdup("", GFP_KERNEL);
}
}
} else {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 15261ba464c5..70f4e65fced2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -615,7 +615,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
return dget(sb->s_root);
full_path = cifs_build_path_to_root(vol, cifs_sb,
- cifs_sb_master_tcon(cifs_sb));
+ cifs_sb_master_tcon(cifs_sb), 0);
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
@@ -914,7 +914,6 @@ const struct inode_operations cifs_file_inode_ops = {
};
const struct inode_operations cifs_symlink_inode_ops = {
- .readlink = generic_readlink,
.get_link = cifs_get_link,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1f17f6bd7a60..7ea8a3393936 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -514,6 +514,7 @@ struct smb_vol {
bool persistent:1;
bool nopersistent:1;
bool resilient:1; /* noresilient not required since not fored for CA */
+ bool domainauto:1;
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -525,6 +526,7 @@ struct smb_vol {
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
unsigned int echo_interval; /* echo interval in secs */
+ __u64 snapshot_time; /* needed for timewarp tokens */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
};
@@ -646,6 +648,8 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
__u8 preauth_hash[512];
+ struct delayed_work reconnect; /* reconnect workqueue job */
+ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */
#endif /* CONFIG_CIFS_SMB2 */
unsigned long echo_interval;
};
@@ -827,6 +831,7 @@ struct cifs_ses {
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
+ bool domainAuto:1;
#ifdef CONFIG_CIFS_SMB2
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
@@ -849,6 +854,7 @@ cap_unix(struct cifs_ses *ses)
struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
+ struct list_head rlist; /* reconnect list */
struct list_head openFileList;
spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
@@ -922,6 +928,7 @@ struct cifs_tcon {
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
+ bool need_reopen_files:1; /* need to reopen tcon file handles */
bool use_resilient:1; /* use resilient instead of durable handles */
bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
@@ -932,6 +939,7 @@ struct cifs_tcon {
__u32 maximal_access;
__u32 vol_serial_number;
__le64 vol_create_time;
+ __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */
__u32 ss_flags; /* sector size flags */
__u32 perf_sector_size; /* best sector size for perf */
__u32 max_chunks;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ced0e42ce460..c7b3c841e660 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -63,7 +63,8 @@ extern void exit_cifs_spnego(void);
extern char *build_path_from_dentry(struct dentry *);
extern char *cifs_build_path_to_root(struct smb_vol *vol,
struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon);
+ struct cifs_tcon *tcon,
+ int add_treename);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
@@ -206,6 +207,9 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
struct tcon_link *tlink,
struct cifs_pending_open *open);
extern void cifs_del_pending_open(struct cifs_pending_open *open);
+extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
+ int from_reconnect);
+extern void cifs_put_tcon(struct cifs_tcon *tcon);
#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
extern void cifs_dfs_release_automount_timer(void);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e3fed9249a04..b47261858e6d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -35,7 +35,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/task_io_accounting_ops.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4547aeddd12b..35ae49ed1f76 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -34,13 +34,14 @@
#include <linux/pagevec.h>
#include <linux/freezer.h>
#include <linux/namei.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/inet.h>
#include <linux/module.h>
#include <keys/user-type.h>
#include <net/ipv6.h>
#include <linux/parser.h>
+#include <linux/bvec.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -52,6 +53,9 @@
#include "nterr.h"
#include "rfc1002pdu.h"
#include "fscache.h"
+#ifdef CONFIG_CIFS_SMB2
+#include "smb2proto.h"
+#endif
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -88,6 +92,7 @@ enum {
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
+ Opt_domainauto,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -95,6 +100,7 @@ enum {
Opt_dirmode, Opt_port,
Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits,
+ Opt_snapshot,
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
@@ -176,6 +182,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_nopersistent, "nopersistenthandles"},
{ Opt_resilient, "resilienthandles"},
{ Opt_noresilient, "noresilienthandles"},
+ { Opt_domainauto, "domainauto"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -191,6 +198,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_actimeo, "actimeo=%s" },
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
+ { Opt_snapshot, "snapshot=%s" },
{ Opt_blank_user, "user=" },
{ Opt_blank_user, "username=" },
@@ -1499,6 +1507,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_noresilient:
vol->resilient = false; /* already the default */
break;
+ case Opt_domainauto:
+ vol->domainauto = true;
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -1601,6 +1612,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
vol->echo_interval = option;
break;
+ case Opt_snapshot:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid snapshot time\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->snapshot_time = option;
+ break;
case Opt_max_credits:
if (get_option_ul(args, &option) || (option < 20) ||
(option > 60000)) {
@@ -2100,8 +2119,8 @@ cifs_find_tcp_session(struct smb_vol *vol)
return NULL;
}
-static void
-cifs_put_tcp_session(struct TCP_Server_Info *server)
+void
+cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
{
struct task_struct *task;
@@ -2118,6 +2137,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
cancel_delayed_work_sync(&server->echo);
+#ifdef CONFIG_CIFS_SMB2
+ if (from_reconnect)
+ /*
+ * Avoid deadlock here: reconnect work calls
+ * cifs_put_tcp_session() at its end. Need to be sure
+ * that reconnect work does nothing with server pointer after
+ * that step.
+ */
+ cancel_delayed_work(&server->reconnect);
+ else
+ cancel_delayed_work_sync(&server->reconnect);
+#endif
+
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
spin_unlock(&GlobalMid_Lock);
@@ -2182,6 +2214,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
+#ifdef CONFIG_CIFS_SMB2
+ INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server);
+ mutex_init(&tcp_ses->reconnect_mutex);
+#endif
memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
@@ -2340,7 +2376,7 @@ cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&cifs_tcp_ses_lock);
sesInfoFree(ses);
- cifs_put_tcp_session(server);
+ cifs_put_tcp_session(server, 0);
}
#ifdef CONFIG_KEYS
@@ -2514,7 +2550,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
mutex_unlock(&ses->session_mutex);
/* existing SMB ses has a server reference already */
- cifs_put_tcp_session(server);
+ cifs_put_tcp_session(server, 0);
free_xid(xid);
return ses;
}
@@ -2548,6 +2584,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
if (!ses->domainName)
goto get_ses_fail;
}
+ if (volume_info->domainauto)
+ ses->domainAuto = volume_info->domainauto;
ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
@@ -2586,7 +2624,7 @@ static int match_tcon(struct cifs_tcon *tcon, const char *unc)
}
static struct cifs_tcon *
-cifs_find_tcon(struct cifs_ses *ses, const char *unc)
+cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
{
struct list_head *tmp;
struct cifs_tcon *tcon;
@@ -2594,8 +2632,14 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &ses->tcon_list) {
tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
- if (!match_tcon(tcon, unc))
+ if (!match_tcon(tcon, volume_info->UNC))
continue;
+
+#ifdef CONFIG_CIFS_SMB2
+ if (tcon->snapshot_time != volume_info->snapshot_time)
+ continue;
+#endif /* CONFIG_CIFS_SMB2 */
+
++tcon->tc_count;
spin_unlock(&cifs_tcp_ses_lock);
return tcon;
@@ -2604,7 +2648,7 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc)
return NULL;
}
-static void
+void
cifs_put_tcon(struct cifs_tcon *tcon)
{
unsigned int xid;
@@ -2636,7 +2680,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
int rc, xid;
struct cifs_tcon *tcon;
- tcon = cifs_find_tcon(ses, volume_info->UNC);
+ tcon = cifs_find_tcon(ses, volume_info);
if (tcon) {
cifs_dbg(FYI, "Found match on UNC path\n");
/* existing tcon already has a reference */
@@ -2657,6 +2701,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
goto out_fail;
}
+ if (volume_info->snapshot_time) {
+#ifdef CONFIG_CIFS_SMB2
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "Use SMB2 or later for snapshot mount option\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ } else
+ tcon->snapshot_time = volume_info->snapshot_time;
+#else
+ cifs_dbg(VFS, "Snapshot mount option requires SMB2 support\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+ }
+
tcon->ses = ses;
if (volume_info->password) {
tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
@@ -3706,7 +3766,8 @@ remote_path_check:
/*
* cifs_build_path_to_root works only when we have a valid tcon
*/
- full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
+ full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon,
+ tcon->Flags & SMB_SHARE_IS_IN_DFS);
if (full_path == NULL) {
rc = -ENOMEM;
goto mount_fail_check;
@@ -3792,7 +3853,7 @@ mount_fail_check:
else if (ses)
cifs_put_smb_ses(ses);
else
- cifs_put_tcp_session(server);
+ cifs_put_tcp_session(server, 0);
bdi_destroy(&cifs_sb->bdi);
}
@@ -4103,7 +4164,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info);
if (IS_ERR(ses)) {
tcon = (struct cifs_tcon *)ses;
- cifs_put_tcp_session(master_tcon->ses->server);
+ cifs_put_tcp_session(master_tcon->ses->server, 0);
goto out;
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 789ff1df2d8d..2c227a99f369 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -47,7 +47,7 @@ renew_parental_timestamps(struct dentry *direntry)
char *
cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon)
+ struct cifs_tcon *tcon, int add_treename)
{
int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
int dfsplen;
@@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
return full_path;
}
- if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+ if (add_treename)
dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7f5f6176c6f1..18a1e1d6671f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -777,6 +777,11 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
struct list_head *tmp1;
struct list_head tmp_list;
+ if (!tcon->use_persistent || !tcon->need_reopen_files)
+ return;
+
+ tcon->need_reopen_files = false;
+
cifs_dbg(FYI, "Reopen persistent handles");
INIT_LIST_HEAD(&tmp_list);
@@ -793,7 +798,8 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
list_for_each_safe(tmp, tmp1, &tmp_list) {
open_file = list_entry(tmp, struct cifsFileInfo, rlist);
- cifs_reopen_file(open_file, false /* do not flush */);
+ if (cifs_reopen_file(open_file, false /* do not flush */))
+ tcon->need_reopen_files = true;
list_del_init(&open_file->rlist);
cifsFileInfo_put(open_file);
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 9f51b81119f2..001528781b6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
cifs_sb = CIFS_SB(inode->i_sb);
- cifs_dbg(VFS, "cifs ioctl 0x%x\n", command);
+ cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
switch (command) {
case FS_IOC_GETFLAGS:
if (pSMBFile == NULL)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index d031af8d3d4d..c4d996f78e1c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -45,13 +45,8 @@
(CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN)
#define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n"
-#define CIFS_MF_SYMLINK_MD5_FORMAT \
- "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n"
-#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \
- md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \
- md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \
- md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\
- md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
+#define CIFS_MF_SYMLINK_MD5_FORMAT "%16phN\n"
+#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash
static int
symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8f6a2a5863b9..a27fc8791551 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -285,6 +285,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
rc = -ENOMEM;
goto error_exit;
}
+ spin_lock_init(&cifsFile->file_info_lock);
file->private_data = cifsFile;
cifsFile->tlink = cifs_get_tlink(tlink);
tcon = tlink_tcon(tlink);
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index f9e766f464be..b2aff0c6f22c 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,7 +260,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
* and check it for zero before using.
*/
max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
- if (!max_buf) {
+ if (max_buf < sizeof(struct smb2_lock_element)) {
free_xid(xid);
return -EINVAL;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 5ca5ea4668a1..87457227812c 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -250,16 +250,19 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
}
cifs_mark_open_files_invalid(tcon);
+ if (tcon->use_persistent)
+ tcon->need_reopen_files = true;
rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
mutex_unlock(&tcon->ses->session_mutex);
- if (tcon->use_persistent)
- cifs_reopen_persistent_handles(tcon);
-
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc)
goto out;
+
+ if (smb2_command != SMB2_INTERNAL_CMD)
+ queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+
atomic_inc(&tconInfoReconnectCount);
out:
/*
@@ -280,7 +283,7 @@ out:
case SMB2_CHANGE_NOTIFY:
case SMB2_QUERY_INFO:
case SMB2_SET_INFO:
- return -EAGAIN;
+ rc = -EAGAIN;
}
unload_nls(nls_codepage);
return rc;
@@ -1972,6 +1975,55 @@ smb2_echo_callback(struct mid_q_entry *mid)
add_credits(server, credits_received, CIFS_ECHO_OP);
}
+void smb2_reconnect_server(struct work_struct *work)
+{
+ struct TCP_Server_Info *server = container_of(work,
+ struct TCP_Server_Info, reconnect.work);
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon, *tcon2;
+ struct list_head tmp_list;
+ int tcon_exist = false;
+
+ /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
+ mutex_lock(&server->reconnect_mutex);
+
+ INIT_LIST_HEAD(&tmp_list);
+ cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n");
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->need_reconnect || tcon->need_reopen_files) {
+ tcon->tc_count++;
+ list_add_tail(&tcon->rlist, &tmp_list);
+ tcon_exist = true;
+ }
+ }
+ }
+ /*
+ * Get the reference to server struct to be sure that the last call of
+ * cifs_put_tcon() in the loop below won't release the server pointer.
+ */
+ if (tcon_exist)
+ server->srv_count++;
+
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
+ if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon))
+ cifs_reopen_persistent_handles(tcon);
+ list_del_init(&tcon->rlist);
+ cifs_put_tcon(tcon);
+ }
+
+ cifs_dbg(FYI, "Reconnecting tcons finished\n");
+ mutex_unlock(&server->reconnect_mutex);
+
+ /* now we can safely release srv struct */
+ if (tcon_exist)
+ cifs_put_tcp_session(server, 1);
+}
+
int
SMB2_echo(struct TCP_Server_Info *server)
{
@@ -1984,32 +2036,11 @@ SMB2_echo(struct TCP_Server_Info *server)
cifs_dbg(FYI, "In echo request\n");
if (server->tcpStatus == CifsNeedNegotiate) {
- struct list_head *tmp, *tmp2;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
-
- cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n");
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp2, &ses->tcon_list) {
- tcon = list_entry(tmp2, struct cifs_tcon,
- tcon_list);
- /* add check for persistent handle reconnect */
- if (tcon && tcon->need_reconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
- rc = smb2_reconnect(SMB2_ECHO, tcon);
- spin_lock(&cifs_tcp_ses_lock);
- }
- }
- }
- spin_unlock(&cifs_tcp_ses_lock);
+ /* No need to send echo on newly established connections */
+ queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ return rc;
}
- /* if no session, renegotiate failed above */
- if (server->tcpStatus == CifsNeedNegotiate)
- return -EIO;
-
rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
if (rc)
return rc;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index fd3709e8de33..dc0d141f33e2 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -80,6 +80,8 @@
#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE)
#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF)
+
#define NUMBER_OF_SMB2_COMMANDS 0x0013
/* BB FIXME - analyze following length BB */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index eb2cde2f64ba..f2d511a6971b 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -96,6 +96,7 @@ extern int smb2_open_file(const unsigned int xid,
extern int smb2_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
+extern void smb2_reconnect_server(struct work_struct *work);
/*
* SMB2 Worker functions - most of protocol specific implementation details
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 699b7868108f..c12bffefa3c9 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -23,7 +23,7 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <crypto/skcipher.h>
+#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -69,46 +69,22 @@ str_to_key(unsigned char *str, unsigned char *key)
static int
smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
{
- int rc;
unsigned char key2[8];
- struct crypto_skcipher *tfm_des;
- struct scatterlist sgin, sgout;
- struct skcipher_request *req;
+ struct crypto_cipher *tfm_des;
str_to_key(key, key2);
- tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+ tfm_des = crypto_alloc_cipher("des", 0, 0);
if (IS_ERR(tfm_des)) {
- rc = PTR_ERR(tfm_des);
- cifs_dbg(VFS, "could not allocate des crypto API\n");
- goto smbhash_err;
- }
-
- req = skcipher_request_alloc(tfm_des, GFP_KERNEL);
- if (!req) {
- rc = -ENOMEM;
cifs_dbg(VFS, "could not allocate des crypto API\n");
- goto smbhash_free_skcipher;
+ return PTR_ERR(tfm_des);
}
- crypto_skcipher_setkey(tfm_des, key2, 8);
-
- sg_init_one(&sgin, in, 8);
- sg_init_one(&sgout, out, 8);
+ crypto_cipher_setkey(tfm_des, key2, 8);
+ crypto_cipher_encrypt_one(tfm_des, out, in);
+ crypto_free_cipher(tfm_des);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL);
-
- rc = crypto_skcipher_encrypt(req);
- if (rc)
- cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
-
- skcipher_request_free(req);
-
-smbhash_free_skcipher:
- crypto_free_skcipher(tfm_des);
-smbhash_err:
- return rc;
+ return 0;
}
static int
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 206a597b2293..fbb84c08e3cd 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -28,8 +28,9 @@
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/tcp.h>
+#include <linux/bvec.h>
#include <linux/highmem.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/mempool.h>
#include "cifspdu.h"
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 1bfb7ba4e85e..f13e09057c6b 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -17,7 +17,6 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
}
static const struct inode_operations coda_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = coda_setattr,
};
diff --git a/fs/compat.c b/fs/compat.c
index bd064a2c3550..e50a2114f474 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,7 +49,7 @@
#include <linux/pagemap.h>
#include <linux/aio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/ioctls.h>
#include "internal.h"
@@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *,
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
- if (sizeof ubuf->f_blocks == 4) {
- if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
- kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ if (sizeof(ubuf->f_bsize) == 4) {
+ if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
+ kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
return -EOVERFLOW;
/* f_files and f_ffree may be -1; it's okay
* to stuff that into 32 bits */
@@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
return compat_sys_fcntl64(fd, cmd, arg);
}
-COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
-{
- long ret;
- aio_context_t ctx64;
-
- mm_segment_t oldfs = get_fs();
- if (unlikely(get_user(ctx64, ctx32p)))
- return -EFAULT;
-
- set_fs(KERNEL_DS);
- /* The __user pointer cast is valid because of the set_fs() */
- ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64);
- set_fs(oldfs);
- /* truncating is ok because it's a user address */
- if (!ret)
- ret = put_user((u32) ctx64, ctx32p);
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct compat_timespec __user *, timeout)
-{
- struct timespec t;
- struct timespec __user *ut = NULL;
-
- if (timeout) {
- if (compat_get_timespec(&t, timeout))
- return -EFAULT;
-
- ut = compat_alloc_user_space(sizeof(*ut));
- if (copy_to_user(ut, &t, sizeof(t)) )
- return -EFAULT;
- }
- return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-}
-
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
@@ -602,42 +563,6 @@ out:
return ret;
}
-static inline long
-copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
-{
- compat_uptr_t uptr;
- int i;
-
- for (i = 0; i < nr; ++i) {
- if (get_user(uptr, ptr32 + i))
- return -EFAULT;
- if (put_user(compat_ptr(uptr), ptr64 + i))
- return -EFAULT;
- }
- return 0;
-}
-
-#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
-
-COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
- int, nr, u32 __user *, iocb)
-{
- struct iocb __user * __user *iocb64;
- long ret;
-
- if (unlikely(nr < 0))
- return -EINVAL;
-
- if (nr > MAX_AIO_SUBMITS)
- nr = MAX_AIO_SUBMITS;
-
- iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
- ret = copy_iocb(nr, iocb, iocb64);
- if (!ret)
- ret = do_io_submit(ctx_id, nr, iocb64, 1);
- return ret;
-}
-
struct compat_ncp_mount_data {
compat_int_t version;
compat_uint_t ncp_fd;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f2d7402abe02..11d087b2b28e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -76,7 +76,7 @@
#include <scsi/sg.h>
#endif
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/if_bonding.h>
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2c6312db8516..39da1103d341 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -29,7 +29,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index db6d69289608..a6ab012a2c6a 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -305,7 +305,6 @@ static const char *configfs_get_link(struct dentry *dentry,
const struct inode_operations configfs_symlink_inode_operations = {
.get_link = configfs_get_link,
- .readlink = generic_readlink,
.setattr = configfs_setattr,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index eb9c92c9b20f..ae6b05629ca1 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -38,7 +38,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <asm/exec.h>
@@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align)
return mod ? dump_skip(cprm, align - mod) : 1;
}
EXPORT_SYMBOL(dump_align);
+
+/*
+ * Ensures that file size is big enough to contain the current file
+ * position. This prevents gdb from complaining about a truncated file
+ * if the last "write" to the file was dump_skip.
+ */
+void dump_truncate(struct coredump_params *cprm)
+{
+ struct file *file = cprm->file;
+ loff_t offset;
+
+ if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ offset = file->f_op->llseek(file, 0, SEEK_CUR);
+ if (i_size_read(file->f_mapping->host) < offset)
+ do_truncate(file->f_path.dentry, offset, 0, file);
+ }
+}
+EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 92348faf9865..f514978f6688 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -8,9 +8,7 @@ config FS_ENCRYPTION
select CRYPTO_XTS
select CRYPTO_CTS
select CRYPTO_CTR
- select CRYPTO_SHA256
select KEYS
- select ENCRYPTED_KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 98f87fe8f186..ac8e4f6a3773 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -27,7 +27,7 @@
#include <linux/bio.h>
#include <linux/dcache.h>
#include <linux/namei.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
static unsigned int num_prealloc_crypto_ctxs = 128;
@@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
{
unsigned long flags;
- if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
ctx->w.bounce_page = NULL;
}
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx);
* Return: An allocated and initialized encryption context on success; error
* value or NULL otherwise.
*/
-struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
+struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
{
struct fscrypt_ctx *ctx = NULL;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
} else {
ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
}
- ctx->flags &= ~FS_WRITE_PATH_FL;
+ ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx;
}
EXPORT_SYMBOL(fscrypt_get_ctx);
@@ -146,9 +146,10 @@ typedef enum {
FS_ENCRYPT,
} fscrypt_direction_t;
-static int do_page_crypto(struct inode *inode,
- fscrypt_direction_t rw, pgoff_t index,
+static int do_page_crypto(const struct inode *inode,
+ fscrypt_direction_t rw, u64 lblk_num,
struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
gfp_t gfp_flags)
{
struct {
@@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode,
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
+ BUG_ON(len == 0);
+
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
@@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode,
page_crypt_complete, &ecr);
BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
- xts_tweak.index = cpu_to_le64(index);
+ xts_tweak.index = cpu_to_le64(lblk_num);
memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
+ sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak);
+ sg_set_page(&src, src_page, len, offs);
+ skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
else
@@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
if (ctx->w.bounce_page == NULL)
return ERR_PTR(-ENOMEM);
- ctx->flags |= FS_WRITE_PATH_FL;
+ ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx->w.bounce_page;
}
/**
* fscrypt_encrypt_page() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- * @gfp_flags: The gfp flag for memory allocation
+ * @inode: The inode for which the encryption should take place
+ * @page: The page to encrypt. Must be locked for bounce-page
+ * encryption.
+ * @len: Length of data to encrypt in @page and encrypted
+ * data in returned page.
+ * @offs: Offset of data within @page and returned
+ * page holding encrypted data.
+ * @lblk_num: Logical block number. This must be unique for multiple
+ * calls with same inode, except when overwriting
+ * previously written data.
+ * @gfp_flags: The gfp flag for memory allocation
*
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
+ * Encrypts @page using the ctx encryption context. Performs encryption
+ * either in-place or into a newly allocated bounce page.
+ * Called on the page write path.
*
- * Called on the page write path. The caller must call
+ * Bounce page allocation is the default.
+ * In this case, the contents of @page are encrypted and stored in an
+ * allocated bounce page. @page has to be locked and the caller must call
* fscrypt_restore_control_page() on the returned ciphertext page to
* release the bounce buffer and the encryption context.
*
- * Return: An allocated page with the encrypted content on success. Else, an
+ * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
+ * fscrypt_operations. Here, the input-page is returned with its content
+ * encrypted.
+ *
+ * Return: A page with the encrypted content on success. Else, an
* error value or NULL.
*/
-struct page *fscrypt_encrypt_page(struct inode *inode,
- struct page *plaintext_page, gfp_t gfp_flags)
+struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+
{
struct fscrypt_ctx *ctx;
- struct page *ciphertext_page = NULL;
+ struct page *ciphertext_page = page;
int err;
- BUG_ON(!PageLocked(plaintext_page));
+ BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
+
+ if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
+ /* with inplace-encryption we just encrypt the page */
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page,
+ len, offs, gfp_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ return ciphertext_page;
+ }
+
+ BUG_ON(!PageLocked(page));
ctx = fscrypt_get_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
@@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
if (IS_ERR(ciphertext_page))
goto errout;
- ctx->w.control_page = plaintext_page;
- err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page,
- gfp_flags);
+ ctx->w.control_page = page;
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page,
+ len, offs, gfp_flags);
if (err) {
ciphertext_page = ERR_PTR(err);
goto errout;
@@ -265,8 +300,13 @@ errout:
EXPORT_SYMBOL(fscrypt_encrypt_page);
/**
- * f2crypt_decrypt_page() - Decrypts a page in-place
- * @page: The page to decrypt. Must be locked.
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @inode: The corresponding inode for the page to decrypt.
+ * @page: The page to decrypt. Must be locked in case
+ * it is a writeback page (FS_CFLG_OWN_PAGES unset).
+ * @len: Number of bytes in @page to be decrypted.
+ * @offs: Start of data in @page.
+ * @lblk_num: Logical block number.
*
* Decrypts page in-place using the ctx encryption context.
*
@@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_decrypt_page(struct page *page)
+int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
+ unsigned int len, unsigned int offs, u64 lblk_num)
{
- BUG_ON(!PageLocked(page));
+ if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
+ BUG_ON(!PageLocked(page));
- return do_page_crypto(page->mapping->host,
- FS_DECRYPT, page->index, page, page, GFP_NOFS);
+ return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
+ offs, GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_page);
-int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
struct fscrypt_ctx *ctx;
@@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
while (len--) {
err = do_page_crypto(inode, FS_ENCRYPT, lblk,
ZERO_PAGE(0), ciphertext_page,
- GFP_NOFS);
+ PAGE_SIZE, 0, GFP_NOFS);
if (err)
goto errout;
@@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- int ret = fscrypt_decrypt_page(page);
+ int ret = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
if (ret) {
WARN_ON_ONCE(1);
@@ -482,17 +525,22 @@ static void fscrypt_destroy(void)
/**
* fscrypt_initialize() - allocate major buffers for fs encryption.
+ * @cop_flags: fscrypt operations flags
*
* We only call this when we start accessing encrypted files, since it
* results in memory getting allocated that wouldn't otherwise be used.
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_initialize(void)
+int fscrypt_initialize(unsigned int cop_flags)
{
int i, res = -ENOMEM;
- if (fscrypt_bounce_page_pool)
+ /*
+ * No need to allocate a bounce page pool if there already is one or
+ * this FS won't use it.
+ */
+ if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool)
return 0;
mutex_lock(&fscrypt_init_mutex);
@@ -521,7 +569,6 @@ fail:
mutex_unlock(&fscrypt_init_mutex);
return res;
}
-EXPORT_SYMBOL(fscrypt_initialize);
/**
* fscrypt_init() - Set up for fs encryption.
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 9b774f4b50c8..56ad9d195f18 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -12,7 +12,7 @@
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
/**
* fname_crypt_complete() - completion callback for filename crypto
@@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst)
return cp - dst;
}
-u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen)
{
int padding = 32;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
* Allocates an output buffer that is sufficient for the crypto operation
* specified by the context and the direction.
*/
-int fscrypt_fname_alloc_buffer(struct inode *inode,
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
u32 ilen, struct fscrypt_str *crypto_str)
{
unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
@@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
fname->disk_name.len = iname->len;
return 0;
}
- ret = get_crypt_info(dir);
+ ret = fscrypt_get_crypt_info(dir);
if (ret && ret != -EOPNOTSUPP)
return ret;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
new file mode 100644
index 000000000000..aeab032d7d35
--- /dev/null
+++ b/fs/crypto/fscrypt_private.h
@@ -0,0 +1,93 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#include <linux/fscrypto.h>
+
+#define FS_FNAME_CRYPTO_DIGEST_SIZE 32
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE 16
+#define FS_AES_128_ECB_KEY_SIZE 16
+#define FS_AES_256_GCM_KEY_SIZE 32
+#define FS_AES_256_CBC_KEY_SIZE 32
+#define FS_AES_256_CTS_KEY_SIZE 32
+#define FS_AES_256_XTS_KEY_SIZE 64
+#define FS_MAX_KEY_SIZE 64
+
+#define FS_KEY_DESC_PREFIX "fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE 8
+
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Flags
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+ u8 format;
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+
+/* This is passed in from userspace into the kernel keyring */
+struct fscrypt_key {
+ u32 mode;
+ u8 raw[FS_MAX_KEY_SIZE];
+ u32 size;
+} __packed;
+
+/*
+ * A pointer to this structure is stored in the file system's in-core
+ * representation of an inode.
+ */
+struct fscrypt_info {
+ u8 ci_data_mode;
+ u8 ci_filename_mode;
+ u8 ci_flags;
+ struct crypto_skcipher *ci_ctfm;
+ struct key *ci_keyring_key;
+ u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002
+
+struct fscrypt_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+ struct fscrypt_completion_result ecr = { \
+ COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+
+/* crypto.c */
+int fscrypt_initialize(unsigned int cop_flags);
+
+/* keyinfo.c */
+extern int fscrypt_get_crypt_info(struct inode *);
+
+#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 67fb6d8876d0..95cd4c3b06c3 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,7 +10,7 @@
#include <keys/user-type.h>
#include <linux/scatterlist.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
static void derive_crypt_complete(struct crypto_async_request *req, int rc)
{
@@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int get_crypt_info(struct inode *inode)
+int fscrypt_get_crypt_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
@@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode)
u8 *raw_key = NULL;
int res;
- res = fscrypt_initialize();
+ res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
@@ -248,7 +248,8 @@ retry:
goto out;
if (fscrypt_dummy_context_enabled(inode)) {
- memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
+ memset(raw_key, 0x42, keysize/2);
+ memset(raw_key+keysize/2, 0x24, keysize - (keysize/2));
goto got_key;
}
@@ -327,7 +328,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
(ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
(1 << KEY_FLAG_REVOKED) |
(1 << KEY_FLAG_DEAD)))))
- return get_crypt_info(inode);
+ return fscrypt_get_crypt_info(inode);
return 0;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6865663aac69..d6cd7ea4851d 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,8 +10,8 @@
#include <linux/random.h>
#include <linux/string.h>
-#include <linux/fscrypto.h>
#include <linux/mount.h>
+#include "fscrypt_private.h"
static int inode_has_encryption_context(struct inode *inode)
{
@@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode,
return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
}
-int fscrypt_process_policy(struct file *filp,
- const struct fscrypt_policy *policy)
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
int ret;
+ if (copy_from_user(&policy, arg, sizeof(policy)))
+ return -EFAULT;
+
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (policy->version != 0)
+ if (policy.version != 0)
return -EINVAL;
ret = mnt_want_write_file(filp);
@@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp,
ret = -ENOTEMPTY;
else
ret = create_encryption_context_from_policy(inode,
- policy);
+ &policy);
} else if (!is_encryption_context_consistent_with_policy(inode,
- policy)) {
+ &policy)) {
printk(KERN_WARNING
"%s: Policy inconsistent with encryption context\n",
__func__);
@@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp,
mnt_drop_write_file(filp);
return ret;
}
-EXPORT_SYMBOL(fscrypt_process_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
-int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
+ struct inode *inode = file_inode(filp);
struct fscrypt_context ctx;
+ struct fscrypt_policy policy;
int res;
if (!inode->i_sb->s_cop->get_context ||
@@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
return -EINVAL;
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ policy.version = 0;
+ policy.contents_encryption_mode = ctx.contents_encryption_mode;
+ policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy.flags = ctx.flags;
+ memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
+
+ if (copy_to_user(arg, &policy, sizeof(policy)))
+ return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(fscrypt_get_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
@@ -171,6 +179,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
BUG_ON(1);
}
+ /* No restrictions on file types which are never encrypted */
+ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
+ !S_ISLNK(child->i_mode))
+ return 1;
+
/* no restrictions if the parent directory is not encrypted */
if (!parent->i_sb->s_cop->is_encrypted(parent))
return 1;
diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..c45598b912e1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,28 +31,15 @@
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"
-/*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
- */
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
- RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
- RADIX_TREE_EXCEPTIONAL_ENTRY))
-
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
-wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
{
@@ -64,14 +51,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index)
-{
- unsigned long hash = hash_long((unsigned long)mapping ^ index,
- DAX_WAIT_TABLE_BITS);
- return wait_table + hash;
-}
-
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
@@ -98,209 +77,52 @@ static void dax_unmap_atomic(struct block_device *bdev,
blk_queue_exit(bdev->bd_queue);
}
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+static int dax_is_pmd_entry(void *entry)
{
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
- };
- long rc;
-
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- rc = dax_map_atomic(bdev, &dax);
- if (rc < 0)
- return ERR_PTR(rc);
- memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
- dax_unmap_atomic(bdev, &dax);
- return page;
+ return (unsigned long)entry & RADIX_DAX_PMD;
}
-static bool buffer_written(struct buffer_head *bh)
+static int dax_is_pte_entry(void *entry)
{
- return buffer_mapped(bh) && !buffer_unwritten(bh);
+ return !((unsigned long)entry & RADIX_DAX_PMD);
}
-/*
- * When ext4 encounters a hole, it returns without modifying the buffer_head
- * which means that we can't trust b_size. To cope with this, we set b_state
- * to 0 before calling get_block and, if any bit is set, we know we can trust
- * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
- * and would save us time calling get_block repeatedly.
- */
-static bool buffer_size_valid(struct buffer_head *bh)
+static int dax_is_zero_entry(void *entry)
{
- return bh->b_state != 0;
+ return (unsigned long)entry & RADIX_DAX_HZP;
}
-
-static sector_t to_sector(const struct buffer_head *bh,
- const struct inode *inode)
+static int dax_is_empty_entry(void *entry)
{
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
- return sector;
+ return (unsigned long)entry & RADIX_DAX_EMPTY;
}
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
- loff_t start, loff_t end, get_block_t get_block,
- struct buffer_head *bh)
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
- loff_t pos = start, max = start, bh_max = start;
- bool hole = false;
- struct block_device *bdev = NULL;
- int rw = iov_iter_rw(iter), rc;
- long map_len = 0;
+ struct page *page = alloc_pages(GFP_KERNEL, 0);
struct blk_dax_ctl dax = {
- .addr = ERR_PTR(-EIO),
+ .size = PAGE_SIZE,
+ .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
};
- unsigned blkbits = inode->i_blkbits;
- sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
- >> blkbits;
-
- if (rw == READ)
- end = min(end, i_size_read(inode));
-
- while (pos < end) {
- size_t len;
- if (pos == max) {
- long page = pos >> PAGE_SHIFT;
- sector_t block = page << (PAGE_SHIFT - blkbits);
- unsigned first = pos - (block << blkbits);
- long size;
-
- if (pos == bh_max) {
- bh->b_size = PAGE_ALIGN(end - pos);
- bh->b_state = 0;
- rc = get_block(inode, block, bh, rw == WRITE);
- if (rc)
- break;
- if (!buffer_size_valid(bh))
- bh->b_size = 1 << blkbits;
- bh_max = pos - first + bh->b_size;
- bdev = bh->b_bdev;
- /*
- * We allow uninitialized buffers for writes
- * beyond EOF as those cannot race with faults
- */
- WARN_ON_ONCE(
- (buffer_new(bh) && block < file_blks) ||
- (rw == WRITE && buffer_unwritten(bh)));
- } else {
- unsigned done = bh->b_size -
- (bh_max - (pos - first));
- bh->b_blocknr += done >> blkbits;
- bh->b_size -= done;
- }
-
- hole = rw == READ && !buffer_written(bh);
- if (hole) {
- size = bh->b_size - first;
- } else {
- dax_unmap_atomic(bdev, &dax);
- dax.sector = to_sector(bh, inode);
- dax.size = bh->b_size;
- map_len = dax_map_atomic(bdev, &dax);
- if (map_len < 0) {
- rc = map_len;
- break;
- }
- dax.addr += first;
- size = map_len - first;
- }
- /*
- * pos + size is one past the last offset for IO,
- * so pos + size can overflow loff_t at extreme offsets.
- * Cast to u64 to catch this and get the true minimum.
- */
- max = min_t(u64, pos + size, end);
- }
-
- if (iov_iter_rw(iter) == WRITE) {
- len = copy_from_iter_pmem(dax.addr, max - pos, iter);
- } else if (!hole)
- len = copy_to_iter((void __force *) dax.addr, max - pos,
- iter);
- else
- len = iov_iter_zero(max - pos, iter);
-
- if (!len) {
- rc = -EFAULT;
- break;
- }
+ long rc;
- pos += len;
- if (!IS_ERR(dax.addr))
- dax.addr += len;
- }
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ rc = dax_map_atomic(bdev, &dax);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
dax_unmap_atomic(bdev, &dax);
-
- return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes. For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
- struct iov_iter *iter, get_block_t get_block,
- dio_iodone_t end_io, int flags)
-{
- struct buffer_head bh;
- ssize_t retval = -EINVAL;
- loff_t pos = iocb->ki_pos;
- loff_t end = pos + iov_iter_count(iter);
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
-
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- inode_lock(inode);
-
- /* Protects against truncate */
- if (!(flags & DIO_SKIP_DIO_COUNT))
- inode_dio_begin(inode);
-
- retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- inode_unlock(inode);
-
- if (end_io) {
- int err;
-
- err = end_io(iocb, pos, retval, bh.b_private);
- if (err)
- retval = err;
- }
-
- if (!(flags & DIO_SKIP_DIO_COUNT))
- inode_dio_end(inode);
- return retval;
+ return page;
}
-EXPORT_SYMBOL_GPL(dax_do_io);
/*
* DAX radix tree locking
*/
struct exceptional_entry_key {
struct address_space *mapping;
- unsigned long index;
+ pgoff_t entry_start;
};
struct wait_exceptional_entry_queue {
@@ -308,6 +130,26 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+ pgoff_t index, void *entry, struct exceptional_entry_key *key)
+{
+ unsigned long hash;
+
+ /*
+ * If 'entry' is a PMD, align the 'index' that we use for the wait
+ * queue to the start of that PMD. This ensures that all offsets in
+ * the range covered by the PMD map to the same bit lock.
+ */
+ if (dax_is_pmd_entry(entry))
+ index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+
+ key->mapping = mapping;
+ key->entry_start = index;
+
+ hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ return wait_table + hash;
+}
+
static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
int sync, void *keyp)
{
@@ -316,7 +158,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
container_of(wait, struct wait_exceptional_entry_queue, wait);
if (key->mapping != ewait->key.mapping ||
- key->index != ewait->key.index)
+ key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
}
@@ -342,7 +184,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@@ -356,7 +198,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@@ -372,24 +214,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
{
- void *ret, **slot;
+ void *entry, **slot;
struct wait_exceptional_entry_queue ewait;
- wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+ wait_queue_head_t *wq;
init_wait(&ewait.wait);
ewait.wait.func = wake_exceptional_entry_func;
- ewait.key.mapping = mapping;
- ewait.key.index = index;
for (;;) {
- ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!ret || !radix_tree_exceptional_entry(ret) ||
+ if (!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
- return ret;
+ return entry;
}
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mapping->tree_lock);
@@ -399,52 +241,173 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
}
}
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+ pgoff_t index)
+{
+ void *entry, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry)) {
+ unlock_page(entry);
+ put_page(entry);
+ } else {
+ dax_unlock_mapping_entry(mapping, index);
+ }
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry))
+ return;
+
+ /* We have to wake up next waiter for the radix tree entry lock */
+ dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
/*
* Find radix tree entry at given index. If it points to a page, return with
* the page locked. If it points to the exceptional entry, return with the
* radix tree entry locked. If the radix tree doesn't contain given index,
* create empty exceptional entry for the index and return with it locked.
*
+ * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return an error. This error will
+ * happen if there are any 4k entries (either zero pages or DAX entries)
+ * within the 2MiB range that we are requesting.
+ *
+ * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+ * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
+ * insertion will fail if it finds any 4k entries already in the tree, and a
+ * 4k insertion will cause an existing 2MiB entry to be unmapped and
+ * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
+ * well as 2MiB empty entries.
+ *
+ * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+ * real storage backing them. We will leave these real 2MiB DAX entries in
+ * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ *
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+ unsigned long size_flag)
{
- void *ret, **slot;
+ bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
+ void *entry, **slot;
restart:
spin_lock_irq(&mapping->tree_lock);
- ret = get_unlocked_mapping_entry(mapping, index, &slot);
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+
+ if (entry) {
+ if (size_flag & RADIX_DAX_PMD) {
+ if (!radix_tree_exceptional_entry(entry) ||
+ dax_is_pte_entry(entry)) {
+ put_unlocked_mapping_entry(mapping, index,
+ entry);
+ entry = ERR_PTR(-EEXIST);
+ goto out_unlock;
+ }
+ } else { /* trying to grab a PTE entry */
+ if (radix_tree_exceptional_entry(entry) &&
+ dax_is_pmd_entry(entry) &&
+ (dax_is_zero_entry(entry) ||
+ dax_is_empty_entry(entry))) {
+ pmd_downgrade = true;
+ }
+ }
+ }
+
/* No entry for given index? Make sure radix tree is big enough. */
- if (!ret) {
+ if (!entry || pmd_downgrade) {
int err;
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * mapping->tree_lock.
+ */
+ entry = lock_slot(mapping, slot);
+ }
+
spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * Besides huge zero pages the only other thing that gets
+ * downgraded are empty entries which don't need to be
+ * unmapped.
+ */
+ if (pmd_downgrade && dax_is_zero_entry(entry))
+ unmap_mapping_range(mapping,
+ (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+
err = radix_tree_preload(
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err)
+ if (err) {
+ if (pmd_downgrade)
+ put_locked_mapping_entry(mapping, index, entry);
return ERR_PTR(err);
- ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
- RADIX_DAX_ENTRY_LOCK);
+ }
spin_lock_irq(&mapping->tree_lock);
- err = radix_tree_insert(&mapping->page_tree, index, ret);
+
+ if (pmd_downgrade) {
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ dax_wake_mapping_entry_waiter(mapping, index, entry,
+ true);
+ }
+
+ entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+
+ err = __radix_tree_insert(&mapping->page_tree, index,
+ dax_radix_order(entry), entry);
radix_tree_preload_end();
if (err) {
spin_unlock_irq(&mapping->tree_lock);
- /* Someone already created the entry? */
- if (err == -EEXIST)
+ /*
+ * Someone already created the entry? This is a
+ * normal failure when inserting PMDs in a range
+ * that already contains PTEs. In that case we want
+ * to return -EEXIST immediately.
+ */
+ if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
goto restart;
+ /*
+ * Our insertion of a DAX PMD entry failed, most
+ * likely because it collided with a PTE sized entry
+ * at a different index in the PMD range. We haven't
+ * inserted anything into the radix tree and have no
+ * waiters to wake.
+ */
return ERR_PTR(err);
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
spin_unlock_irq(&mapping->tree_lock);
- return ret;
+ return entry;
}
/* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(ret)) {
- struct page *page = ret;
+ if (!radix_tree_exceptional_entry(entry)) {
+ struct page *page = entry;
get_page(page);
spin_unlock_irq(&mapping->tree_lock);
@@ -457,15 +420,26 @@ restart:
}
return page;
}
- ret = lock_slot(mapping, slot);
+ entry = lock_slot(mapping, slot);
+ out_unlock:
spin_unlock_irq(&mapping->tree_lock);
- return ret;
+ return entry;
}
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, bool wake_all)
+ pgoff_t index, void *entry, bool wake_all)
{
- wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+ struct exceptional_entry_key key;
+ wait_queue_head_t *wq;
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -473,66 +447,41 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
- if (waitqueue_active(wq)) {
- struct exceptional_entry_key key;
-
- key.mapping = mapping;
- key.index = index;
+ if (waitqueue_active(wq))
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
- }
}
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+ pgoff_t index, bool trunc)
{
- void *ret, **slot;
+ int ret = 0;
+ void *entry;
+ struct radix_tree_root *page_tree = &mapping->page_tree;
spin_lock_irq(&mapping->tree_lock);
- ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
- if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
- !slot_locked(mapping, slot))) {
- spin_unlock_irq(&mapping->tree_lock);
- return;
- }
- unlock_slot(mapping, slot);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (!entry || !radix_tree_exceptional_entry(entry))
+ goto out;
+ if (!trunc &&
+ (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+ radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+ goto out;
+ radix_tree_delete(page_tree, index);
+ mapping->nrexceptional--;
+ ret = 1;
+out:
+ put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
- dax_wake_mapping_entry_waiter(mapping, index, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
-}
-
-/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- if (!radix_tree_exceptional_entry(entry))
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, false);
+ return ret;
}
-
/*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- void *entry;
+ int ret = __dax_invalidate_mapping_entry(mapping, index, true);
- spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
@@ -540,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* caller has seen exceptional entry for this index, we better find it
* at that index as well...
*/
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
- spin_unlock_irq(&mapping->tree_lock);
- return 0;
- }
- radix_tree_delete(&mapping->page_tree, index);
+ WARN_ON_ONCE(!ret);
+ return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+ int ret = 0;
+ void *entry, **slot;
+ struct radix_tree_root *page_tree = &mapping->page_tree;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ if (!entry || !radix_tree_exceptional_entry(entry) ||
+ slot_locked(mapping, slot))
+ goto out;
+ if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+ radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto out;
+ radix_tree_delete(page_tree, index);
mapping->nrexceptional--;
+ ret = 1;
+out:
spin_unlock_irq(&mapping->tree_lock);
- dax_wake_mapping_entry_waiter(mapping, index, true);
+ if (ret)
+ dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+ return ret;
+}
- return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+ pgoff_t index)
+{
+ return __dax_invalidate_mapping_entry(mapping, index, false);
}
/*
@@ -560,26 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* otherwise it will simply fall out of the page cache under memory
* pressure without ever having been dirtied.
*/
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
struct page *page;
+ int ret;
/* Hole page already exists? Return it... */
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- return VM_FAULT_LOCKED;
+ if (!radix_tree_exceptional_entry(*entry)) {
+ page = *entry;
+ goto out;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ if (!page)
return VM_FAULT_OOM;
- }
+ out:
vmf->page = page;
- return VM_FAULT_LOCKED;
+ ret = finish_fault(vmf);
+ vmf->page = NULL;
+ *entry = page;
+ if (!ret) {
+ /* Grab reference for PTE that is now referencing the page */
+ get_page(page);
+ return VM_FAULT_NOPAGE;
+ }
+ return ret;
}
static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -600,11 +587,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
return 0;
}
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-
+/*
+ * By this point grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs. If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
- void *entry, sector_t sector)
+ void *entry, sector_t sector,
+ unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
int error = 0;
@@ -627,28 +620,43 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
if (error)
return ERR_PTR(error);
+ } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
+ /* replacing huge zero page with PMD block mapping */
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
- new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
- RADIX_DAX_ENTRY_LOCK);
+ new_entry = dax_radix_locked_entry(sector, flags);
+
if (hole_fill) {
__delete_from_page_cache(entry, NULL);
/* Drop pagecache reference */
put_page(entry);
- error = radix_tree_insert(page_tree, index, new_entry);
+ error = __radix_tree_insert(page_tree, index,
+ dax_radix_order(new_entry), new_entry);
if (error) {
new_entry = ERR_PTR(error);
goto unlock;
}
mapping->nrexceptional++;
- } else {
+ } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ /*
+ * Only swap our new entry into the radix tree if the current
+ * entry is a zero page or an empty entry. If a normal PTE or
+ * PMD entry is already in the tree, we leave it alone. This
+ * means that if we are trying to insert a PTE and the
+ * existing entry is a PMD, we will just leave the PMD in the
+ * tree and dirty it if necessary.
+ */
+ struct radix_tree_node *node;
void **slot;
void *ret;
- ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- radix_tree_replace_slot(slot, new_entry);
+ __radix_tree_replace(page_tree, node, slot,
+ new_entry, NULL, NULL);
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -668,63 +676,171 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
return new_entry;
}
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+ unsigned long address;
+
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+ pgoff_t index, unsigned long pfn)
+{
+ struct vm_area_struct *vma;
+ pte_t pte, *ptep = NULL;
+ pmd_t *pmdp = NULL;
+ spinlock_t *ptl;
+ bool changed;
+
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+ unsigned long address;
+
+ cond_resched();
+
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ address = pgoff_address(index, vma);
+ changed = false;
+ if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
+ continue;
+
+ if (pmdp) {
+#ifdef CONFIG_FS_DAX_PMD
+ pmd_t pmd;
+
+ if (pfn != pmd_pfn(*pmdp))
+ goto unlock_pmd;
+ if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+ goto unlock_pmd;
+
+ flush_cache_page(vma, address, pfn);
+ pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_mkclean(pmd);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ changed = true;
+unlock_pmd:
+ spin_unlock(ptl);
+#endif
+ } else {
+ if (pfn != pte_pfn(*ptep))
+ goto unlock_pte;
+ if (!pte_dirty(*ptep) && !pte_write(*ptep))
+ goto unlock_pte;
+
+ flush_cache_page(vma, address, pfn);
+ pte = ptep_clear_flush(vma, address, ptep);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+ changed = true;
+unlock_pte:
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ if (changed)
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
+ }
+ i_mmap_unlock_read(mapping);
+}
+
static int dax_writeback_one(struct block_device *bdev,
struct address_space *mapping, pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int type = RADIX_DAX_TYPE(entry);
- struct radix_tree_node *node;
struct blk_dax_ctl dax;
- void **slot;
+ void *entry2, **slot;
int ret = 0;
- spin_lock_irq(&mapping->tree_lock);
/*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
+ * A page got tagged dirty in DAX mapping? Something is seriously
+ * wrong.
*/
- if (!__radix_tree_lookup(page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
-
- /* another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto unlock;
+ if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ return -EIO;
- if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+ spin_lock_irq(&mapping->tree_lock);
+ entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* Entry got punched out / reallocated? */
+ if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback. We have to
+ * compare sectors as we must not bail out due to difference in lockbit
+ * or entry type.
+ */
+ if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
ret = -EIO;
- goto unlock;
+ goto put_unlocked;
}
- dax.sector = RADIX_DAX_SECTOR(entry);
- dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+ /* Another fsync thread may have already written back this entry */
+ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
+ /* Lock the entry to serialize with page faults */
+ entry = lock_slot(mapping, slot);
+ /*
+ * We can clear the tag now but we have to be careful so that concurrent
+ * dax_writeback_one() calls for the same index cannot finish before we
+ * actually flush the caches. This is achieved as the calls will look
+ * at the entry only under tree_lock and once they do that they will
+ * see the entry locked and wait for it to unlock.
+ */
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
spin_unlock_irq(&mapping->tree_lock);
/*
+ * Even if dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we are given will be aligned to
+ * the start index of the PMD, as will the sector we pull from
+ * 'entry'. This allows us to flush for PMD_SIZE and not have to
+ * worry about partial PMD writebacks.
+ */
+ dax.sector = dax_radix_sector(entry);
+ dax.size = PAGE_SIZE << dax_radix_order(entry);
+
+ /*
* We cannot hold tree_lock while calling dax_map_atomic() because it
* eventually calls cond_resched().
*/
ret = dax_map_atomic(bdev, &dax);
- if (ret < 0)
+ if (ret < 0) {
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
+ }
if (WARN_ON_ONCE(ret < dax.size)) {
ret = -EIO;
goto unmap;
}
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
wb_cache_pmem(dax.addr, dax.size);
-
+ /*
+ * After we have flushed the cache, we can clear the dirty tag. There
+ * cannot be new dirty data in the pfn after the flush has completed as
+ * the pfn mappings are writeprotected and fault waits for mapping
+ * entry lock.
+ */
spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
unmap:
dax_unmap_atomic(bdev, &dax);
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
- unlock:
+ put_unlocked:
+ put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
@@ -738,12 +854,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index, pmd_index;
+ pgoff_t start_index, end_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
bool done = false;
int i, ret = 0;
- void *entry;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -753,15 +868,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
- pmd_index = DAX_PMD_INDEX(start_index);
-
- rcu_read_lock();
- entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
- rcu_read_unlock();
-
- /* see if the start of our range is covered by a PMD entry */
- if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
- start_index = pmd_index;
tag_pages_for_writeback(mapping, start_index, end_index);
@@ -794,7 +900,7 @@ static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, sector_t sector, size_t size,
void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
struct blk_dax_ctl dax = {
.sector = sector,
.size = size,
@@ -806,7 +912,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(dax.addr);
dax_unmap_atomic(bdev, &dax);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
*entryp = ret;
@@ -815,323 +921,6 @@ static int dax_insert_mapping(struct address_space *mapping,
}
/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- void *entry;
- struct buffer_head bh;
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
- unsigned blkbits = inode->i_blkbits;
- sector_t block;
- pgoff_t size;
- int error;
- int major = 0;
-
- /*
- * Check whether offset isn't beyond end of file now. Caller is supposed
- * to hold locks serializing us with truncate / punch hole so this is
- * a reliable test.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- return VM_FAULT_SIGBUS;
-
- memset(&bh, 0, sizeof(bh));
- block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
- bh.b_bdev = inode->i_sb->s_bdev;
- bh.b_size = PAGE_SIZE;
-
- entry = grab_mapping_entry(mapping, vmf->pgoff);
- if (IS_ERR(entry)) {
- error = PTR_ERR(entry);
- goto out;
- }
-
- error = get_block(inode, block, &bh, 0);
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO; /* fs corruption? */
- if (error)
- goto unlock_entry;
-
- if (vmf->cow_page) {
- struct page *new_page = vmf->cow_page;
- if (buffer_written(&bh))
- error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
- bh.b_size, new_page, vaddr);
- else
- clear_user_highpage(new_page, vaddr);
- if (error)
- goto unlock_entry;
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- return VM_FAULT_LOCKED;
- }
- vmf->entry = entry;
- return VM_FAULT_DAX_LOCKED;
- }
-
- if (!buffer_mapped(&bh)) {
- if (vmf->flags & FAULT_FLAG_WRITE) {
- error = get_block(inode, block, &bh, 1);
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO;
- if (error)
- goto unlock_entry;
- } else {
- return dax_load_hole(mapping, entry, vmf);
- }
- }
-
- /* Filesystem should not return unwritten buffers to us! */
- WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
- bh.b_size, &entry, vma, vmf);
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
- if (error == -ENOMEM)
- return VM_FAULT_OOM | major;
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if ((error < 0) && (error != -EBUSY))
- return VM_FAULT_SIGBUS | major;
- return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
-static void __dax_dbg(struct buffer_head *bh, unsigned long address,
- const char *reason, const char *fn)
-{
- if (bh) {
- char bname[BDEVNAME_SIZE];
- bdevname(bh->b_bdev, bname);
- pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
- "length %zd fallback: %s\n", fn, current->comm,
- address, bname, bh->b_state, (u64)bh->b_blocknr,
- bh->b_size, reason);
- } else {
- pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
- current->comm, address, reason);
- }
-}
-
-#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
-
-/**
- * dax_pmd_fault - handle a PMD fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * pmd_fault handler for DAX files.
- */
-int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct buffer_head bh;
- unsigned blkbits = inode->i_blkbits;
- unsigned long pmd_addr = address & PMD_MASK;
- bool write = flags & FAULT_FLAG_WRITE;
- struct block_device *bdev;
- pgoff_t size, pgoff;
- sector_t block;
- int result = 0;
- bool alloc = false;
-
- /* dax pmd mappings require pfn_t_devmap() */
- if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
- return VM_FAULT_FALLBACK;
-
- /* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED)) {
- split_huge_pmd(vma, pmd, address);
- dax_pmd_dbg(NULL, address, "cow write");
- return VM_FAULT_FALLBACK;
- }
- /* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start) {
- dax_pmd_dbg(NULL, address, "vma start unaligned");
- return VM_FAULT_FALLBACK;
- }
- if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
- dax_pmd_dbg(NULL, address, "vma end unaligned");
- return VM_FAULT_FALLBACK;
- }
-
- pgoff = linear_page_index(vma, pmd_addr);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size)
- return VM_FAULT_SIGBUS;
- /* If the PMD would cover blocks out of the file */
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(NULL, address,
- "offset + huge page size > file size");
- return VM_FAULT_FALLBACK;
- }
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
- block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
-
- bh.b_size = PMD_SIZE;
-
- if (get_block(inode, block, &bh, 0) != 0)
- return VM_FAULT_SIGBUS;
-
- if (!buffer_mapped(&bh) && write) {
- if (get_block(inode, block, &bh, 1) != 0)
- return VM_FAULT_SIGBUS;
- alloc = true;
- WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- }
-
- bdev = bh.b_bdev;
-
- /*
- * If the filesystem isn't willing to tell us the length of a hole,
- * just fall back to PTEs. Calling get_block 512 times in a loop
- * would be silly.
- */
- if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
- dax_pmd_dbg(&bh, address, "allocated block too small");
- return VM_FAULT_FALLBACK;
- }
-
- /*
- * If we allocated new storage, make sure no process has any
- * zero pages covering this hole
- */
- if (alloc) {
- loff_t lstart = pgoff << PAGE_SHIFT;
- loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
-
- truncate_pagecache_range(inode, lstart, lend);
- }
-
- if (!write && !buffer_mapped(&bh)) {
- spinlock_t *ptl;
- pmd_t entry;
- struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
-
- if (unlikely(!zero_page)) {
- dax_pmd_dbg(&bh, address, "no zero page");
- goto fallback;
- }
-
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_none(*pmd)) {
- spin_unlock(ptl);
- dax_pmd_dbg(&bh, address, "pmd already present");
- goto fallback;
- }
-
- dev_dbg(part_to_dev(bdev->bd_part),
- "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
- __func__, current->comm, address,
- (unsigned long long) to_sector(&bh, inode));
-
- entry = mk_pmd(zero_page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
- set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
- result = VM_FAULT_NOPAGE;
- spin_unlock(ptl);
- } else {
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PMD_SIZE,
- };
- long length = dax_map_atomic(bdev, &dax);
-
- if (length < 0) {
- dax_pmd_dbg(&bh, address, "dax-error fallback");
- goto fallback;
- }
- if (length < PMD_SIZE) {
- dax_pmd_dbg(&bh, address, "dax-length too small");
- dax_unmap_atomic(bdev, &dax);
- goto fallback;
- }
- if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
- dax_pmd_dbg(&bh, address, "pfn unaligned");
- dax_unmap_atomic(bdev, &dax);
- goto fallback;
- }
-
- if (!pfn_t_devmap(dax.pfn)) {
- dax_unmap_atomic(bdev, &dax);
- dax_pmd_dbg(&bh, address, "pfn not in memmap");
- goto fallback;
- }
- dax_unmap_atomic(bdev, &dax);
-
- /*
- * For PTE faults we insert a radix tree entry for reads, and
- * leave it clean. Then on the first write we dirty the radix
- * tree entry via the dax_pfn_mkwrite() path. This sequence
- * allows the dax_pfn_mkwrite() call to be simpler and avoid a
- * call into get_block() to translate the pgoff to a sector in
- * order to be able to create a new radix tree entry.
- *
- * The PMD path doesn't have an equivalent to
- * dax_pfn_mkwrite(), though, so for a read followed by a
- * write we traverse all the way through dax_pmd_fault()
- * twice. This means we can just skip inserting a radix tree
- * entry completely on the initial read and just wait until
- * the write to insert a dirty entry.
- */
- if (write) {
- /*
- * We should insert radix-tree entry and dirty it here.
- * For now this is broken...
- */
- }
-
- dev_dbg(part_to_dev(bdev->bd_part),
- "%s: %s addr: %lx pfn: %lx sect: %llx\n",
- __func__, current->comm, address,
- pfn_t_to_pfn(dax.pfn),
- (unsigned long long) dax.sector);
- result |= vmf_insert_pfn_pmd(vma, address, pmd,
- dax.pfn, write);
- }
-
- out:
- return result;
-
- fallback:
- count_vm_event(THP_FAULT_FALLBACK);
- result = VM_FAULT_FALLBACK;
- goto out;
-}
-EXPORT_SYMBOL_GPL(dax_pmd_fault);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
@@ -1140,17 +929,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- void *entry;
+ void *entry, **slot;
pgoff_t index = vmf->pgoff;
spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
- goto out;
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (!entry || !radix_tree_exceptional_entry(entry)) {
+ if (entry)
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+ return VM_FAULT_NOPAGE;
+ }
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- put_unlocked_mapping_entry(mapping, index, entry);
-out:
+ entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * If we race with somebody updating the PTE and finish_mkwrite_fault()
+ * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+ * the fault in either case.
+ */
+ finish_mkwrite_fault(vmf);
+ put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -1191,62 +990,13 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file. This is intended for hole-punch operations. If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
- get_block_t get_block)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- struct buffer_head bh;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- int err;
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
- BUG_ON((offset + length) > PAGE_SIZE);
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
- bh.b_size = PAGE_SIZE;
- err = get_block(inode, index, &bh, 0);
- if (err < 0 || !buffer_written(&bh))
- return err;
-
- return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
- offset, length);
+ return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
- unsigned length = PAGE_ALIGN(from) - from;
- return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-
-#ifdef CONFIG_FS_IOMAP
static loff_t
-iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
{
struct iov_iter *iter = data;
@@ -1265,13 +1015,28 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
return -EIO;
+ /*
+ * Write can allocate block for an area which has a hole page mapped
+ * into page tables. We have to tear down these mappings so that data
+ * written by write(2) is visible in mmap.
+ */
+ if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (end - 1) >> PAGE_SHIFT);
+ }
+
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
struct blk_dax_ctl dax = { 0 };
ssize_t map_len;
- dax.sector = iomap->blkno +
- (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ dax.sector = dax_iomap_sector(iomap, pos);
dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
map_len = dax_map_atomic(iomap->bdev, &dax);
if (map_len < 0) {
@@ -1303,7 +1068,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
/**
- * iomap_dax_rw - Perform I/O to a DAX file
+ * dax_iomap_rw - Perform I/O to a DAX file
* @iocb: The control block for this I/O
* @iter: The addresses to do I/O from or to
* @ops: iomap ops passed from the file system
@@ -1313,7 +1078,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* and evicting any page cache pages in the region under I/O.
*/
ssize_t
-iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
struct iomap_ops *ops)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
@@ -1324,26 +1089,9 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE)
flags |= IOMAP_WRITE;
- /*
- * Yes, even DAX files can have page cache attached to them: A zeroed
- * page is inserted into the pagecache when we have to serve a write
- * fault on a hole. It should never be dirtied and can simply be
- * dropped from the pagecache once we get real data for the page.
- *
- * XXX: This is racy against mmap, and there's nothing we can do about
- * it. We'll eventually need to shift this down even further so that
- * we can check if we allocated blocks over a hole first.
- */
- if (mapping->nrpages) {
- ret = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT,
- (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- }
-
while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
- iter, iomap_dax_actor);
+ iter, dax_iomap_actor);
if (ret <= 0)
break;
pos += ret;
@@ -1353,10 +1101,19 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
iocb->ki_pos += done;
return done ? done : ret;
}
-EXPORT_SYMBOL_GPL(iomap_dax_rw);
+EXPORT_SYMBOL_GPL(dax_iomap_rw);
+
+static int dax_fault_return(int error)
+{
+ if (error == 0)
+ return VM_FAULT_NOPAGE;
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
+}
/**
- * iomap_dax_fault - handle a page fault on a DAX file
+ * dax_iomap_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @ops: iomap ops passed from the file system
@@ -1365,17 +1122,18 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw);
* or mkwrite handler for DAX files. Assumes the caller has done all the
* necessary locking for the page fault to proceed successfully.
*/
-int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
struct iomap_ops *ops)
{
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector;
struct iomap iomap = { 0 };
- unsigned flags = 0;
+ unsigned flags = IOMAP_FAULT;
int error, major = 0;
+ int vmf_ret = 0;
void *entry;
/*
@@ -1386,12 +1144,6 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (pos >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- entry = grab_mapping_entry(mapping, vmf->pgoff);
- if (IS_ERR(entry)) {
- error = PTR_ERR(entry);
- goto out;
- }
-
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
@@ -1402,13 +1154,19 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error)
- goto unlock_entry;
+ return dax_fault_return(error);
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
- error = -EIO; /* fs corruption? */
- goto unlock_entry;
+ vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
+ goto finish_iomap;
+ }
+
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+ if (IS_ERR(entry)) {
+ vmf_ret = dax_fault_return(PTR_ERR(entry));
+ goto finish_iomap;
}
- sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+ sector = dax_iomap_sector(&iomap, pos);
if (vmf->cow_page) {
switch (iomap.type) {
@@ -1427,13 +1185,13 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
if (error)
- goto unlock_entry;
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- return VM_FAULT_LOCKED;
- }
- vmf->entry = entry;
- return VM_FAULT_DAX_LOCKED;
+ goto error_unlock_entry;
+
+ __SetPageUptodate(vmf->cow_page);
+ vmf_ret = finish_fault(vmf);
+ if (!vmf_ret)
+ vmf_ret = VM_FAULT_DONE_COW;
+ goto unlock_entry;
}
switch (iomap.type) {
@@ -1445,11 +1203,16 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
error = dax_insert_mapping(mapping, iomap.bdev, sector,
PAGE_SIZE, &entry, vma, vmf);
+ /* -EBUSY is fine, somebody else faulted on the same PTE */
+ if (error == -EBUSY)
+ error = 0;
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
- if (!(vmf->flags & FAULT_FLAG_WRITE))
- return dax_load_hole(mapping, entry, vmf);
+ if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf_ret = dax_load_hole(mapping, &entry, vmf);
+ goto unlock_entry;
+ }
/*FALLTHRU*/
default:
WARN_ON_ONCE(1);
@@ -1457,15 +1220,215 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break;
}
+ error_unlock_entry:
+ vmf_ret = dax_fault_return(error) | major;
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
- if (error == -ENOMEM)
- return VM_FAULT_OOM | major;
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if (error < 0 && error != -EBUSY)
- return VM_FAULT_SIGBUS | major;
- return VM_FAULT_NOPAGE | major;
+ finish_iomap:
+ if (ops->iomap_end) {
+ int copied = PAGE_SIZE;
+
+ if (vmf_ret & VM_FAULT_ERROR)
+ copied = 0;
+ /*
+ * The fault is done by now and there's no way back (other
+ * thread may be already happily using PTE we have installed).
+ * Just ignore error from ->iomap_end since we cannot do much
+ * with it.
+ */
+ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
+ }
+ return vmf_ret;
+}
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+#ifdef CONFIG_FS_DAX_PMD
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+ struct vm_fault *vmf, unsigned long address,
+ struct iomap *iomap, loff_t pos, bool write, void **entryp)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct block_device *bdev = iomap->bdev;
+ struct blk_dax_ctl dax = {
+ .sector = dax_iomap_sector(iomap, pos),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+ void *ret;
+
+ if (length < 0) /* dax_map_atomic() failed */
+ return VM_FAULT_FALLBACK;
+ if (length < PMD_SIZE)
+ goto unmap_fallback;
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+ goto unmap_fallback;
+ if (!pfn_t_devmap(dax.pfn))
+ goto unmap_fallback;
+
+ dax_unmap_atomic(bdev, &dax);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+ RADIX_DAX_PMD);
+ if (IS_ERR(ret))
+ return VM_FAULT_FALLBACK;
+ *entryp = ret;
+
+ return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+
+ unmap_fallback:
+ dax_unmap_atomic(bdev, &dax);
+ return VM_FAULT_FALLBACK;
+}
+
+static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+ struct vm_fault *vmf, unsigned long address,
+ struct iomap *iomap, void **entryp)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long pmd_addr = address & PMD_MASK;
+ struct page *zero_page;
+ spinlock_t *ptl;
+ pmd_t pmd_entry;
+ void *ret;
+
+ zero_page = mm_get_huge_zero_page(vma->vm_mm);
+
+ if (unlikely(!zero_page))
+ return VM_FAULT_FALLBACK;
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
+ RADIX_DAX_PMD | RADIX_DAX_HZP);
+ if (IS_ERR(ret))
+ return VM_FAULT_FALLBACK;
+ *entryp = ret;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ return VM_FAULT_FALLBACK;
+ }
+
+ pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+ pmd_entry = pmd_mkhuge(pmd_entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
+}
+
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
+ struct inode *inode = mapping->host;
+ int result = VM_FAULT_FALLBACK;
+ struct iomap iomap = { 0 };
+ pgoff_t max_pgoff, pgoff;
+ struct vm_fault vmf;
+ void *entry;
+ loff_t pos;
+ int error;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ goto fallback;
+
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ goto fallback;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ goto fallback;
+
+ /*
+ * Check whether offset isn't beyond end of file now. Caller is
+ * supposed to hold locks serializing us with truncate / punch hole so
+ * this is a reliable test.
+ */
+ pgoff = linear_page_index(vma, pmd_addr);
+ max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ if (pgoff > max_pgoff)
+ return VM_FAULT_SIGBUS;
+
+ /* If the PMD would extend beyond the file size */
+ if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+ goto fallback;
+
+ /*
+ * Note that we don't use iomap_apply here. We aren't doing I/O, only
+ * setting up a mapping, so really we're using iomap_begin() as a way
+ * to look up our filesystem block.
+ */
+ pos = (loff_t)pgoff << PAGE_SHIFT;
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+ if (error)
+ goto fallback;
+
+ if (iomap.offset + iomap.length < pos + PMD_SIZE)
+ goto finish_iomap;
+
+ /*
+ * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto finish_iomap;
+
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+ vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+ &iomap, pos, write, &entry);
+ break;
+ case IOMAP_UNWRITTEN:
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(write))
+ goto unlock_entry;
+ result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+ &entry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
+ finish_iomap:
+ if (ops->iomap_end) {
+ int copied = PMD_SIZE;
+
+ if (result == VM_FAULT_FALLBACK)
+ copied = 0;
+ /*
+ * The fault is done by now and there's no way back (other
+ * thread may be already happily using PMD we have installed).
+ * Just ignore error from ->iomap_end since we cannot do much
+ * with it.
+ */
+ ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+ &iomap);
+ }
+ fallback:
+ if (result == VM_FAULT_FALLBACK) {
+ split_huge_pmd(vma, pmd, address);
+ count_vm_event(THP_FAULT_FALLBACK);
+ }
+ return result;
}
-EXPORT_SYMBOL_GPL(iomap_dax_fault);
-#endif /* CONFIG_FS_IOMAP */
+EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#endif /* CONFIG_FS_DAX_PMD */
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7cc953ac81..95d71eda8142 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,7 +26,7 @@
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/swap.h>
@@ -1273,38 +1273,44 @@ rename_retry:
goto again;
}
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
+struct check_mount {
+ struct vfsmount *mnt;
+ unsigned int mounted;
+};
-static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
- int *ret = data;
- if (d_mountpoint(dentry)) {
- *ret = 1;
+ struct check_mount *info = data;
+ struct path path = { .mnt = info->mnt, .dentry = dentry };
+
+ if (likely(!d_mountpoint(dentry)))
+ return D_WALK_CONTINUE;
+ if (__path_is_mountpoint(&path)) {
+ info->mounted = 1;
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
}
/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
+ * path_has_submounts - check for mounts over a dentry in the
+ * current namespace.
+ * @parent: path to check.
*
* Return true if the parent or its subdirectories contain
- * a mount point
+ * a mount point in the current namespace.
*/
-int have_submounts(struct dentry *parent)
+int path_has_submounts(const struct path *parent)
{
- int ret = 0;
+ struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
- d_walk(parent, &ret, check_mount, NULL);
+ read_seqlock_excl(&mount_lock);
+ d_walk(parent->dentry, &data, path_check_mount, NULL);
+ read_sequnlock_excl(&mount_lock);
- return ret;
+ return data.mounted;
}
-EXPORT_SYMBOL(have_submounts);
+EXPORT_SYMBOL(path_has_submounts);
/*
* Called by mount code to set a mountpoint and check if the mountpoint is
@@ -1330,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry)
}
spin_lock(&dentry->d_lock);
if (!d_unlinked(dentry)) {
- dentry->d_flags |= DCACHE_MOUNTED;
- ret = 0;
+ ret = -EBUSY;
+ if (!d_mountpoint(dentry)) {
+ dentry->d_flags |= DCACHE_MOUNTED;
+ ret = 0;
+ }
}
spin_unlock(&dentry->d_lock);
out:
diff --git a/fs/dcookies.c b/fs/dcookies.c
index ac44a69fbea9..0d0461cf2431 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -26,7 +26,7 @@
#include <linux/mutex.h>
#include <linux/path.h>
#include <linux/compat.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* The dcookies are allocated from a kmem_cache and
* hashed onto a small number of lists. None of the
@@ -90,7 +90,7 @@ static void hash_dcookie(struct dcookie_struct * dcs)
}
-static struct dcookie_struct *alloc_dcookie(struct path *path)
+static struct dcookie_struct *alloc_dcookie(const struct path *path)
{
struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
GFP_KERNEL);
@@ -113,7 +113,7 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
/* This is the main kernel-side routine that retrieves the cookie
* value for a dentry/vfsmnt pair.
*/
-int get_dcookie(struct path *path, unsigned long *cookie)
+int get_dcookie(const struct path *path, unsigned long *cookie)
{
int err = 0;
struct dcookie_struct * dcs;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index fb9aa16a7727..c87bae4376b8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio)
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
* filesystems that don't need it and also allows us to create the workqueue
* late enough so the we can include s_id in the name of the workqueue.
*/
-static int sb_init_dio_done_wq(struct super_block *sb)
+int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
@@ -843,24 +843,6 @@ out:
}
/*
- * Clean any dirty buffers in the blockdev mapping which alias newly-created
- * file blocks. Only called for S_ISREG files - blockdevs do not set
- * buffer_new
- */
-static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
-{
- unsigned i;
- unsigned nblocks;
-
- nblocks = map_bh->b_size >> dio->inode->i_blkbits;
-
- for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(map_bh->b_bdev,
- map_bh->b_blocknr + i);
- }
-}
-
-/*
* If we are not writing the entire block and get_block() allocated
* the block for us, we need to fill-in the unused portion of the
* block with zeros. This happens only if user-buffer, fileoffset or
@@ -924,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
const unsigned blkbits = sdio->blkbits;
+ const unsigned i_blkbits = blkbits + sdio->blkfactor;
int ret = 0;
while (sdio->block_in_file < sdio->final_block_in_request) {
@@ -960,11 +943,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
goto do_holes;
sdio->blocks_available =
- map_bh->b_size >> sdio->blkbits;
+ map_bh->b_size >> blkbits;
sdio->next_block_for_io =
map_bh->b_blocknr << sdio->blkfactor;
- if (buffer_new(map_bh))
- clean_blockdev_aliases(dio, map_bh);
+ if (buffer_new(map_bh)) {
+ clean_bdev_aliases(
+ map_bh->b_bdev,
+ map_bh->b_blocknr,
+ map_bh->b_size >> i_blkbits);
+ }
if (!sdio->blkfactor)
goto do_holes;
@@ -1209,7 +1196,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->inode = inode;
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
- dio->op_flags = WRITE_ODIRECT;
+ dio->op_flags = REQ_SYNC | REQ_IDLE;
} else {
dio->op = REQ_OP_READ;
}
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dcea1e37a1b7..07fed838d8fd 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -268,7 +268,7 @@ void dlm_callback_work(struct work_struct *work)
int dlm_callback_start(struct dlm_ls *ls)
{
ls->ls_callback_wq = alloc_workqueue("dlm_callback",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
if (!ls->ls_callback_wq) {
log_print("can't start dlm_callback workqueue");
return -ENOMEM;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index df955d2209ce..7211e826d90d 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -12,7 +12,7 @@
******************************************************************************/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/configfs.h>
#include <linux/slab.h>
#include <linux/in.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 466f7d60edc2..ca7089aeadab 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -12,7 +12,7 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 216b61604ef9..748e8d59e611 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -18,7 +18,6 @@
* This is the main header file to be included in each DLM source file.
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/types.h>
@@ -39,7 +38,7 @@
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/ratelimit.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/dlm.h>
#include "config.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 35502d4046f5..6df332296c66 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1395,7 +1395,6 @@ static int nodeid_warned(int nodeid, int num_nodes, int *warned)
void dlm_scan_waiters(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
- ktime_t zero = ktime_set(0, 0);
s64 us;
s64 debug_maxus = 0;
u32 debug_scanned = 0;
@@ -1409,7 +1408,7 @@ void dlm_scan_waiters(struct dlm_ls *ls)
mutex_lock(&ls->ls_waiters_mutex);
list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (ktime_equal(lkb->lkb_wait_time, zero))
+ if (!lkb->lkb_wait_time)
continue;
debug_scanned++;
@@ -1419,7 +1418,7 @@ void dlm_scan_waiters(struct dlm_ls *ls)
if (us < dlm_config.ci_waitwarn_us)
continue;
- lkb->lkb_wait_time = zero;
+ lkb->lkb_wait_time = 0;
debug_expired++;
if (us > debug_maxus)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f3e72787e7f9..91592b75c309 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -11,6 +11,8 @@
*******************************************************************************
******************************************************************************/
+#include <linux/module.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609998de533e..7d398d300e97 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -519,29 +519,25 @@ out:
/* Note: sk_callback_lock must be locked before calling this function. */
static void save_callbacks(struct connection *con, struct sock *sk)
{
- lock_sock(sk);
con->orig_data_ready = sk->sk_data_ready;
con->orig_state_change = sk->sk_state_change;
con->orig_write_space = sk->sk_write_space;
con->orig_error_report = sk->sk_error_report;
- release_sock(sk);
}
static void restore_callbacks(struct connection *con, struct sock *sk)
{
write_lock_bh(&sk->sk_callback_lock);
- lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = con->orig_data_ready;
sk->sk_state_change = con->orig_state_change;
sk->sk_write_space = con->orig_write_space;
sk->sk_error_report = con->orig_error_report;
- release_sock(sk);
write_unlock_bh(&sk->sk_callback_lock);
}
/* Make a socket active */
-static void add_sock(struct socket *sock, struct connection *con)
+static void add_sock(struct socket *sock, struct connection *con, bool save_cb)
{
struct sock *sk = sock->sk;
@@ -549,7 +545,7 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock = sock;
sk->sk_user_data = con;
- if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ if (save_cb)
save_callbacks(con, sk);
/* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
@@ -806,7 +802,7 @@ static int tcp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
- add_sock(newsock, othercon);
+ add_sock(newsock, othercon, false);
addcon = othercon;
}
else {
@@ -819,7 +815,10 @@ static int tcp_accept_from_sock(struct connection *con)
else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
- add_sock(newsock, newcon);
+ /* accept copies the sk after we've saved the callbacks, so we
+ don't want to save them a second time or comm errors will
+ result in calling sk_error_report recursively. */
+ add_sock(newsock, newcon, false);
addcon = newcon;
}
@@ -880,7 +879,8 @@ static int sctp_accept_from_sock(struct connection *con)
}
make_sockaddr(&prim.ssp_addr, 0, &addr_len);
- if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+ ret = addr_to_nodeid(&prim.ssp_addr, &nodeid);
+ if (ret) {
unsigned char *b = (unsigned char *)&prim.ssp_addr;
log_print("reject connect from unknown addr");
@@ -919,7 +919,7 @@ static int sctp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
- add_sock(newsock, othercon);
+ add_sock(newsock, othercon, false);
addcon = othercon;
} else {
printk("Extra connection from node %d attempted\n", nodeid);
@@ -930,7 +930,7 @@ static int sctp_accept_from_sock(struct connection *con)
} else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
- add_sock(newsock, newcon);
+ add_sock(newsock, newcon, false);
addcon = newcon;
}
@@ -1058,7 +1058,7 @@ static void sctp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = sctp_connect_to_sock;
- add_sock(sock, con);
+ add_sock(sock, con, true);
/* Bind to all addresses. */
if (sctp_bind_addrs(con, 0))
@@ -1146,7 +1146,7 @@ static void tcp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = tcp_connect_to_sock;
- add_sock(sock, con);
+ add_sock(sock, con, true);
/* Bind to our cluster-known address connecting to avoid
routing problems */
@@ -1366,7 +1366,7 @@ static int tcp_listen_for_all(void)
sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
if (sock) {
- add_sock(sock, con);
+ add_sock(sock, con, true);
result = 0;
}
else {
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 079c0bd71ab7..8e1b618891be 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -11,6 +11,8 @@
*******************************************************************************
******************************************************************************/
+#include <linux/module.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "lock.h"
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 1e6e227134d7..43a96c330570 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -16,11 +16,7 @@
static uint32_t dlm_nl_seqnum;
static uint32_t listener_nlportid;
-static struct genl_family family = {
- .id = GENL_ID_GENERATE,
- .name = DLM_GENL_NAME,
- .version = DLM_GENL_VERSION,
-};
+static struct genl_family family;
static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
{
@@ -69,16 +65,24 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static struct genl_ops dlm_nl_ops[] = {
+static const struct genl_ops dlm_nl_ops[] = {
{
.cmd = DLM_CMD_HELLO,
.doit = user_cmd,
},
};
+static struct genl_family family __ro_after_init = {
+ .name = DLM_GENL_NAME,
+ .version = DLM_GENL_VERSION,
+ .ops = dlm_nl_ops,
+ .n_ops = ARRAY_SIZE(dlm_nl_ops),
+ .module = THIS_MODULE,
+};
+
int __init dlm_netlink_init(void)
{
- return genl_register_family_with_ops(&family, dlm_nl_ops);
+ return genl_register_family(&family);
}
void dlm_netlink_exit(void)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 58c2f4a21b7f..1ce908c2232c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -9,7 +9,6 @@
#include <linux/miscdevice.h>
#include <linux/init.h>
#include <linux/wait.h>
-#include <linux/module.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/poll.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cf390dceddd2..e7413f82d27b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -631,28 +631,23 @@ out_lock:
static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
{
+ DEFINE_DELAYED_CALL(done);
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- char *lower_buf;
+ const char *link;
char *buf;
- mm_segment_t old_fs;
int rc;
- lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!lower_buf)
- return ERR_PTR(-ENOMEM);
- old_fs = get_fs();
- set_fs(get_ds());
- rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
- (char __user *)lower_buf,
- PATH_MAX);
- set_fs(old_fs);
- if (rc < 0)
- goto out;
+ link = vfs_get_link(lower_dentry, &done);
+ if (IS_ERR(link))
+ return ERR_CAST(link);
+
rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb,
- lower_buf, rc);
-out:
- kfree(lower_buf);
- return rc ? ERR_PTR(rc) : buf;
+ link, strlen(link));
+ do_delayed_call(&done);
+ if (rc)
+ return ERR_PTR(rc);
+
+ return buf;
}
static const char *ecryptfs_get_link(struct dentry *dentry,
@@ -1089,7 +1084,6 @@ out:
}
const struct inode_operations ecryptfs_symlink_iops = {
- .readlink = generic_readlink,
.get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
index 5bbf9612140c..70f5d4f9a945 100644
--- a/fs/efs/efs.h
+++ b/fs/efs/efs.h
@@ -14,7 +14,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define EFS_VERSION "1.0a"
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 10db91218933..bcb68fcc8445 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,7 +34,7 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
diff --git a/fs/exec.c b/fs/exec.c
index 4e497b9ee71e..e57946610733 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -19,7 +19,7 @@
* current->executable is only used by the procfs. This allows a dispatch
* table to check for several different types of binary formats. We keep
* trying until we recognize the file or we run out of supported binary
- * formats.
+ * formats.
*/
#include <linux/slab.h>
@@ -58,7 +58,7 @@
#include <linux/compat.h>
#include <linux/vmalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* doing the exec and bprm->mm is the new process's mm.
*/
ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
- &page, NULL);
+ &page, NULL, NULL);
if (ret <= 0)
return NULL;
@@ -1169,8 +1169,10 @@ no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
+#ifdef CONFIG_POSIX_TIMERS
exit_itimers(sig);
flush_itimer_signals();
+#endif
if (atomic_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
@@ -1266,6 +1268,13 @@ int flush_old_exec(struct linux_binprm * bprm)
flush_thread();
current->personality &= ~bprm->per_clear;
+ /*
+ * We have to apply CLOEXEC before we change whether the process is
+ * dumpable (in setup_new_exec) to avoid a race with a process in userspace
+ * trying to access the should-be-closed file descriptors of a process
+ * undergoing exec(2).
+ */
+ do_close_on_exec(current->files);
return 0;
out:
@@ -1275,8 +1284,22 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ struct inode *inode = file_inode(file);
+ if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+ /* Ensure mm->user_ns contains the executable */
+ user_ns = old = bprm->mm->user_ns;
+ while ((user_ns != &init_user_ns) &&
+ !privileged_wrt_inode_uidgid(user_ns, inode))
+ user_ns = user_ns->parent;
+
+ if (old != user_ns) {
+ bprm->mm->user_ns = get_user_ns(user_ns);
+ put_user_ns(old);
+ }
+ }
}
EXPORT_SYMBOL(would_dump);
@@ -1306,7 +1329,6 @@ void setup_new_exec(struct linux_binprm * bprm)
!gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
- would_dump(bprm, bprm->file);
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
set_dumpable(current->mm, suid_dumpable);
}
@@ -1315,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm)
group */
current->self_exec_id++;
flush_signal_handlers(current, 0);
- do_close_on_exec(current->files);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1406,7 +1427,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
unsigned n_fs;
if (p->ptrace) {
- if (p->ptrace & PT_PTRACE_CAP)
+ if (ptracer_capable(p, current_user_ns()))
bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
else
bprm->unsafe |= LSM_UNSAFE_PTRACE;
@@ -1741,6 +1762,8 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out;
+ would_dump(bprm, bprm->file);
+
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d8072bc074a4..0ac62811b341 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -870,46 +870,31 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
page = *pagep;
if (page == NULL) {
- ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
- fsdata);
- if (ret) {
- EXOFS_DBGMSG("simple_write_begin failed\n");
- goto out;
+ page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT,
+ flags);
+ if (!page) {
+ EXOFS_DBGMSG("grab_cache_page_write_begin failed\n");
+ return -ENOMEM;
}
-
- page = *pagep;
+ *pagep = page;
}
/* read modify write */
if (!PageUptodate(page) && (len != PAGE_SIZE)) {
loff_t i_size = i_size_read(mapping->host);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- size_t rlen;
- if (page->index < end_index)
- rlen = PAGE_SIZE;
- else if (page->index == end_index)
- rlen = i_size & ~PAGE_MASK;
- else
- rlen = 0;
-
- if (!rlen) {
+ if (page->index > end_index) {
clear_highpage(page);
SetPageUptodate(page);
- goto out;
- }
-
- ret = _readpage(page, true);
- if (ret) {
- /*SetPageError was done by _readpage. Is it ok?*/
- unlock_page(page);
- EXOFS_DBGMSG("__readpage failed\n");
+ } else {
+ ret = _readpage(page, true);
+ if (ret) {
+ unlock_page(page);
+ EXOFS_DBGMSG("__readpage failed\n");
+ }
}
}
-out:
- if (unlikely(ret))
- _write_failed(mapping->host, pos + len);
-
return ret;
}
@@ -929,18 +914,25 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- /* According to comment in simple_write_end i_mutex is held */
- loff_t i_size = inode->i_size;
- int ret;
-
- ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
- if (unlikely(ret))
- _write_failed(inode, pos + len);
+ loff_t last_pos = pos + copied;
- /* TODO: once simple_write_end marks inode dirty remove */
- if (i_size != inode->i_size)
+ if (!PageUptodate(page)) {
+ if (copied < len) {
+ _write_failed(inode, pos + len);
+ copied = 0;
+ goto out;
+ }
+ SetPageUptodate(page);
+ }
+ if (last_pos > inode->i_size) {
+ i_size_write(inode, last_pos);
mark_inode_dirty(inode);
- return ret;
+ }
+ set_page_dirty(page);
+out:
+ unlock_page(page);
+ put_page(page);
+ return copied;
}
static int exofs_releasepage(struct page *page, gfp_t gfp)
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 36bea5adcaba..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,5 @@
config EXT2_FS
tristate "Second extended fs support"
- select FS_IOMAP if FS_DAX
help
Ext2 is a standard Linux file system for hard disks.
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a0e1478dfd04..b0f241528a30 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
return 0; /* skip atime */
inode_lock_shared(inode);
- ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+ ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out_unlock;
- ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
mark_inode_dirty(inode);
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return ret;
}
-static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- int ret;
-
- if (flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- }
- down_read(&ei->dax_sem);
-
- ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
-
- up_read(&ei->dax_sem);
- if (flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
@@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
- .pmd_fault = ext2_dax_pmd_fault,
+ /*
+ * .pmd_fault is not supported for DAX because allocation in ext2
+ * cannot be reliably aligned to huge page sizes and so pmd faults
+ * will always fail and fall back to regular faults.
+ */
.page_mkwrite = ext2_dax_fault,
.pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
@@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 41b8b44a391c..f073bfca694b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode,
}
if (IS_DAX(inode)) {
- int i;
-
/*
* We must unmap blocks before zeroing so that writeback cannot
* overwrite zeros with stale data from block device page cache.
*/
- for (i = 0; i < count; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key),
+ count);
/*
* block must be initialised before we put it in the tree
* so that it's not found by another thread before it's
@@ -754,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
}
- } else {
- *new = true;
}
+ *new = true;
ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex);
@@ -850,6 +846,9 @@ struct iomap_ops ext2_iomap_ops = {
.iomap_begin = ext2_iomap_begin,
.iomap_end = ext2_iomap_end,
};
+#else
+/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
+struct iomap_ops ext2_iomap_ops;
#endif /* CONFIG_FS_DAX */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -1293,9 +1292,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, ext2_get_block);
- else if (test_opt(inode->i_sb, NOBH))
+ if (IS_DAX(inode)) {
+ error = iomap_zero_range(inode, newsize,
+ PAGE_ALIGN(newsize) - newsize, NULL,
+ &ext2_iomap_ops);
+ } else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
else
@@ -1476,6 +1477,10 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
else
ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+ if (i_size_read(inode) < 0) {
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_dtime = 0;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_state = 0;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 9d617423e936..191e02b28ce8 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -14,7 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 6cb042b53b5b..9e25a71fe1a2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -31,7 +31,7 @@
#include <linux/mount.h>
#include <linux/log2.h>
#include <linux/quotaops.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 8437b191bf5d..eeffb0138a17 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -21,7 +21,6 @@
#include "xattr.h"
const struct inode_operations ext2_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
@@ -30,7 +29,6 @@ const struct inode_operations ext2_symlink_inode_operations = {
};
const struct inode_operations ext2_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dfa519979038..fd389935ecd1 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
if (error)
return error;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
}
break;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a8a750f59621..2163c1e69f2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -397,8 +397,9 @@ struct flex_groups {
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
+/* Flags we can manipulate through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
@@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
return container_of(inode, struct ext4_inode_info, vfs_inode);
}
-static inline struct timespec ext4_current_time(struct inode *inode)
-{
- return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline int ext4_sb_has_crypto(struct super_block *sb)
-{
- return ext4_has_feature_encrypt(sb);
-}
-
static inline bool ext4_encrypted_inode(struct inode *inode)
{
return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
@@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
@@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2492,7 +2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate(struct inode *);
+extern int ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
@@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
@@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
}
-static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
-{
- int blksize = 1 << inode->i_blkbits;
-
- return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
-}
+extern struct iomap_ops ext4_iomap_ops;
#endif /* __KERNEL__ */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b1d52c14098e..f97611171023 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
+ /* We do not support data journalling for encrypted data */
+ if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ }
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- else
- BUG();
+ BUG();
}
static inline int ext4_should_journal_data(struct inode *inode)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c930a0110fb4..3e295d3350a9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,7 @@
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
@@ -3777,14 +3777,6 @@ out:
return err;
}
-static void unmap_underlying_metadata_blocks(struct block_device *bdev,
- sector_t block, int count)
-{
- int i;
- for (i = 0; i < count; i++)
- unmap_underlying_metadata(bdev, block + i);
-}
-
/*
* Handle EOFBLOCKS_FL flag, clearing it if necessary
*/
@@ -4121,9 +4113,8 @@ out:
* new.
*/
if (allocated > map->m_len) {
- unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
- newblock + map->m_len,
- allocated - map->m_len);
+ clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
+ allocated - map->m_len);
allocated = map->m_len;
}
map->m_len = allocated;
@@ -4631,7 +4622,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(handle_t *handle, struct inode *inode)
+int ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
@@ -4645,7 +4636,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ return err;
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
@@ -4657,12 +4650,9 @@ retry:
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- if (err) {
- ext4_std_error(inode->i_sb, err);
- return;
- }
- err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
- ext4_std_error(inode->i_sb, err);
+ if (err)
+ return err;
+ return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
@@ -4701,7 +4691,7 @@ retry:
/*
* Recalculate credits when extent tree depth changes.
*/
- if (depth >= 0 && depth != ext_depth(inode)) {
+ if (depth != ext_depth(inode)) {
credits = ext4_chunk_trans_blocks(inode, len);
depth = ext_depth(inode);
}
@@ -4725,7 +4715,7 @@ retry:
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
@@ -4853,7 +4843,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
@@ -4878,7 +4868,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_dio;
}
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (new_size) {
ext4_update_inode_size(inode, new_size);
} else {
@@ -5568,7 +5558,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
@@ -5678,7 +5668,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a822d30e73f..d663d3d7c81c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,6 +31,42 @@
#include "xattr.h"
#include "acl.h"
+#ifdef CONFIG_FS_DAX
+static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ inode_lock_shared(inode);
+ /*
+ * Recheck under inode lock - at this point we are sure it cannot
+ * change anymore
+ */
+ if (!IS_DAX(inode)) {
+ inode_unlock_shared(inode);
+ /* Fallback to buffered IO in case we cannot support DAX */
+ return generic_file_read_iter(iocb, to);
+ }
+ ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+#endif
+
+static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(file_inode(iocb->ki_filp)))
+ return ext4_dax_read_iter(iocb, to);
+#endif
+ return generic_file_read_iter(iocb, to);
+}
+
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
@@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
return 0;
}
+/* Is IO overwriting allocated and initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+{
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, blklen;
+
+ if (pos + len > i_size_read(inode))
+ return false;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
+ blklen = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of the blocks have been preallocated,
+ * regardless of whether they have been initialized or not. To exclude
+ * unwritten extents, we need to check m_flags.
+ */
+ return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ return ret;
+ /*
+ * If we have encountered a bitmap-format file, the size limit
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
+ return -EFBIG;
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+ }
+ return iov_iter_count(from);
+}
+
+#ifdef CONFIG_FS_DAX
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+ bool overwrite = false;
+
+ inode_lock(inode);
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ ret = file_update_time(iocb->ki_filp);
+ if (ret)
+ goto out;
+
+ if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+ overwrite = true;
+ downgrade_write(&inode->i_rwsem);
+ }
+ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+out:
+ if (!overwrite)
+ inode_unlock(inode);
+ else
+ inode_unlock_shared(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+#endif
+
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
int overwrite = 0;
ssize_t ret;
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return ext4_dax_write_iter(iocb, from);
+#endif
+
inode_lock(inode);
- ret = generic_write_checks(iocb, from);
+ ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_unwritten_wait(inode);
}
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-
- if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
- ret = -EFBIG;
- goto out;
- }
- iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
- }
-
iocb->private = &overwrite;
- if (o_direct) {
- size_t length = iov_iter_count(from);
- loff_t pos = iocb->ki_pos;
-
- /* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- pos + length <= i_size_read(inode)) {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, len;
-
- map.m_lblk = pos >> blkbits;
- map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits);
- len = map.m_len;
-
- err = ext4_map_blocks(NULL, inode, &map, 0);
- /*
- * 'err==len' means that all of blocks has
- * been preallocated no matter they are
- * initialized or not. For excluding
- * unwritten extents, we need to check
- * m_flags. There are two conditions that
- * indicate for initialized extents. 1) If we
- * hit extent cache, EXT4_MAP_MAPPED flag is
- * returned; 2) If we do a real lookup,
- * non-flags are returned. So we should check
- * these two conditions.
- */
- if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
- }
- }
+ /* Check whether we do a DIO overwrite or not */
+ if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
+ ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
+ overwrite = 1;
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
@@ -179,7 +258,6 @@ out:
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int result;
- handle_t *handle = NULL;
struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -187,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
- EXT4_DATA_TRANS_BLOCKS(sb));
- } else
- down_read(&EXT4_I(inode)->i_mmap_sem);
-
- if (IS_ERR(handle))
- result = VM_FAULT_SIGBUS;
- else
- result = dax_fault(vma, vmf, ext4_dax_get_block);
-
- if (write) {
- if (!IS_ERR(handle))
- ext4_journal_stop(handle);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ }
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ if (write)
sb_end_pagefault(sb);
- } else
- up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
@@ -213,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
int result;
- handle_t *handle = NULL;
struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = flags & FAULT_FLAG_WRITE;
@@ -221,26 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
- ext4_chunk_trans_blocks(inode,
- PMD_SIZE / PAGE_SIZE));
- } else
- down_read(&EXT4_I(inode)->i_mmap_sem);
-
- if (IS_ERR(handle))
- result = VM_FAULT_SIGBUS;
- else
- result = dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_get_block);
-
- if (write) {
- if (!IS_ERR(handle))
- ext4_journal_stop(handle);
- up_read(&EXT4_I(inode)->i_mmap_sem);
+ }
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+ &ext4_iomap_ops);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ if (write)
sb_end_pagefault(sb);
- } else
- up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
@@ -687,7 +739,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 170421edfdfe..e57e8d90ea54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1039,7 +1039,7 @@ got:
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
- ext4_current_time(inode);
+ current_time(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
@@ -1115,8 +1115,7 @@ got:
}
if (encrypt) {
- /* give pointer to avoid set_context with journal ops. */
- err = fscrypt_inherit_context(dir, inode, &encrypt, true);
+ err = fscrypt_inherit_context(dir, inode, handle, true);
if (err)
goto fail_free_drop;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index f74d5ee2cdec..437df6a1a841 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle,
EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ /*
+ * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+ * get cleared
+ */
+ ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
- if (!value)
+ if (!value) {
+ error = -ENOMEM;
goto out;
+ }
error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
value, len);
@@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ /*
+ * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+ * get set.
+ */
+ ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
@@ -1971,7 +1983,7 @@ out:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c064727ed62..88d57af1b516 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
- EXT4_INODE_SIZE(inode->i_sb) -
- offset);
}
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_INODE_SIZE(inode->i_sb) - offset);
}
return csum;
@@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode)
"couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
- if (inode->i_blocks)
- ext4_truncate(inode);
+ if (inode->i_blocks) {
+ err = ext4_truncate(inode);
+ if (err) {
+ ext4_error(inode->i_sb,
+ "couldn't truncate inode %lu (err %d)",
+ inode->i_ino, err);
+ goto stop_handle;
+ }
+ }
/*
* ext4_ext_truncate() doesn't reserve any slop when it
@@ -654,12 +661,8 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- ext4_lblk_t i;
-
- for (i = 0; i < map->m_len; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- map->m_pblk + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -767,6 +770,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_update_bh_state(bh, map.m_flags);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
+ } else if (ret == 0) {
+ /* hole case, need to fill in bh->b_size */
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
}
return ret;
}
@@ -1127,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -1166,7 +1171,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = fscrypt_decrypt_page(page);
+ err = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
return err;
}
#endif
@@ -2360,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
BUG_ON(map->m_len == 0);
if (map->m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map->m_len; i++)
- unmap_underlying_metadata(bdev, map->m_pblk + i);
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
}
return 0;
}
@@ -2891,7 +2894,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb)) {
+ if (ext4_nonda_switch(inode->i_sb) ||
+ S_ISLNK(inode->i_mode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3268,53 +3272,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
#ifdef CONFIG_FS_DAX
-/*
- * Get block function for DAX IO and mmap faults. It takes care of converting
- * unwritten extents to written ones and initializes new / converted blocks
- * to zeros.
- */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap)
{
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned long first_block = offset >> blkbits;
+ unsigned long last_block = (offset + length - 1) >> blkbits;
+ struct ext4_map_blocks map;
int ret;
- ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
- if (!create)
- return _ext4_get_block(inode, iblock, bh_result, 0);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_PRE_IO |
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0)
- return ret;
+ map.m_lblk = first_block;
+ map.m_len = last_block - first_block + 1;
- if (buffer_unwritten(bh_result)) {
+ if (!(flags & IOMAP_WRITE)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ } else {
+ int dio_credits;
+ handle_t *handle;
+ int retries = 0;
+
+ /* Trim mapping request to maximum we can map at once for DIO */
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+retry:
/*
- * We are protected by i_mmap_sem or i_mutex so we know block
- * cannot go away from under us even though we dropped
- * i_data_sem. Convert extent to written and write zeros there.
+ * Either we allocate blocks and then we don't get unwritten
+ * extent so we have reserved enough credits, or the blocks
+ * are already allocated and unwritten and in that case
+ * extent conversion fits in the credits as well.
*/
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_CONVERT |
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0)
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0) {
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
return ret;
+ }
+
+ /*
+ * If we added blocks beyond i_size, we need to make sure they
+ * will get truncated if we crash before updating i_size in
+ * ext4_iomap_end(). For faults we don't need to do that (and
+ * even cannot because for orphan list operations inode_lock is
+ * required) - if we happen to instantiate block beyond i_size,
+ * it is because we race with truncate which has already added
+ * the inode to the orphan list.
+ */
+ if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
+ (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
+ int err;
+
+ err = ext4_orphan_add(handle, inode);
+ if (err < 0) {
+ ext4_journal_stop(handle);
+ return err;
+ }
+ }
+ ext4_journal_stop(handle);
}
- /*
- * At least for now we have to clear BH_New so that DAX code
- * doesn't attempt to zero blocks again in a racy way.
- */
- clear_buffer_new(bh_result);
+
+ iomap->flags = 0;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = first_block << blkbits;
+
+ if (ret == 0) {
+ iomap->type = IOMAP_HOLE;
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->length = (u64)map.m_len << blkbits;
+ } else {
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ } else {
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+ iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
+ iomap->length = (u64)map.m_len << blkbits;
+ }
+
+ if (map.m_flags & EXT4_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
return 0;
}
-#else
-/* Just define empty function, it will never get called. */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
{
- BUG();
- return 0;
+ int ret = 0;
+ handle_t *handle;
+ int blkbits = inode->i_blkbits;
+ bool truncate = false;
+
+ if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto orphan_del;
+ }
+ if (ext4_update_inode_size(inode, offset + written))
+ ext4_mark_inode_dirty(handle, inode);
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ if (iomap->offset + iomap->length >
+ ALIGN(inode->i_size, 1 << blkbits)) {
+ ext4_lblk_t written_blk, end_blk;
+
+ written_blk = (offset + written) >> blkbits;
+ end_blk = (offset + length) >> blkbits;
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+ }
+ /*
+ * Remove inode from orphan list if we were extending a inode and
+ * everything went fine.
+ */
+ if (!truncate && inode->i_nlink &&
+ !list_empty(&EXT4_I(inode)->i_orphan))
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ if (truncate) {
+ ext4_truncate_failed_write(inode);
+orphan_del:
+ /*
+ * If truncate failed early the inode might still be on the
+ * orphan list; we need to make sure the inode is removed from
+ * the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret;
}
+
+struct iomap_ops ext4_iomap_ops = {
+ .iomap_begin = ext4_iomap_begin,
+ .iomap_end = ext4_iomap_end,
+};
+
#endif
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -3436,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
iocb->private = NULL;
if (overwrite)
get_block_func = ext4_dio_get_block_overwrite;
- else if (IS_DAX(inode)) {
- /*
- * We can avoid zeroing for aligned DAX writes beyond EOF. Other
- * writes need zeroing either because they can race with page
- * faults or because they use partial blocks.
- */
- if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
- ext4_aligned_io(inode, offset, count))
- get_block_func = ext4_dio_get_block;
- else
- get_block_func = ext4_dax_get_block;
- dio_flags = DIO_LOCKING;
- } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
get_block_func = ext4_dio_get_block;
dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
@@ -3462,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
#ifdef CONFIG_EXT4_FS_ENCRYPTION
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
#endif
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, iter, get_block_func,
- ext4_end_io_dio, dio_flags);
- } else
- ret = __blockdev_direct_IO(iocb, inode,
- inode->i_sb->s_bdev, iter,
- get_block_func,
- ext4_end_io_dio, NULL, dio_flags);
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+ get_block_func, ext4_end_io_dio, NULL,
+ dio_flags);
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
@@ -3538,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
/*
@@ -3546,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
* we are protected against page writeback as well.
*/
inode_lock_shared(inode);
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
- } else {
- size_t count = iov_iter_count(iter);
-
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block,
- NULL, NULL, 0);
- }
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+ iocb->ki_pos + count);
+ if (ret)
+ goto out_unlock;
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, ext4_dio_get_block, NULL, NULL, 0);
out_unlock:
inode_unlock_shared(inode);
return ret;
@@ -3587,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ext4_has_inline_data(inode))
return 0;
+ /* DAX uses iomap path now */
+ if (WARN_ON_ONCE(IS_DAX(inode)))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
@@ -3615,6 +3706,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
+static int ext4_set_page_dirty(struct page *page)
+{
+ WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
+ WARN_ON_ONCE(!page_has_buffers(page));
+ return __set_page_dirty_buffers(page);
+}
+
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
@@ -3622,6 +3720,7 @@ static const struct address_space_operations ext4_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
+ .set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3654,6 +3753,7 @@ static const struct address_space_operations ext4_da_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
+ .set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_da_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3743,7 +3843,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
- WARN_ON_ONCE(fscrypt_decrypt_page(page));
+ WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
+ page, PAGE_SIZE, 0, page->index));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3792,8 +3893,10 @@ static int ext4_block_zero_page_range(handle_t *handle,
if (length > max || length < 0)
length = max;
- if (IS_DAX(inode))
- return dax_zero_page_range(inode, from, length, ext4_get_block);
+ if (IS_DAX(inode)) {
+ return iomap_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
+ }
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -4026,7 +4129,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
@@ -4091,10 +4194,11 @@ int ext4_inode_attach_jinode(struct inode *inode)
* that's fine - as long as they are linked from the inode, the post-crash
* ext4_truncate() run will find them and release them.
*/
-void ext4_truncate(struct inode *inode)
+int ext4_truncate(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int credits;
+ int err = 0;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
@@ -4108,7 +4212,7 @@ void ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return;
+ return 0;
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
@@ -4120,13 +4224,13 @@ void ext4_truncate(struct inode *inode)
ext4_inline_data_truncate(inode, &has_inline);
if (has_inline)
- return;
+ return 0;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return;
+ return 0;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4135,10 +4239,8 @@ void ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ext4_std_error(inode->i_sb, PTR_ERR(handle));
- return;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4152,7 +4254,8 @@ void ext4_truncate(struct inode *inode)
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
*/
- if (ext4_orphan_add(handle, inode))
+ err = ext4_orphan_add(handle, inode);
+ if (err)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
@@ -4160,11 +4263,13 @@ void ext4_truncate(struct inode *inode)
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ext4_ext_truncate(handle, inode);
+ err = ext4_ext_truncate(handle, inode);
else
ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
+ if (err)
+ goto out_stop;
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4180,11 +4285,12 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
trace_ext4_truncate_exit(inode);
+ return err;
}
/*
@@ -4352,7 +4458,9 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
+ !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
+ !ext4_encrypted_inode(inode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@ -4411,7 +4519,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
{
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
+ EXT4_INODE_SIZE(inode->i_sb) &&
+ *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
ext4_find_inline_data_nolock(inode);
} else
@@ -4434,6 +4544,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
+ loff_t size;
int block;
uid_t i_uid;
gid_t i_gid;
@@ -4456,10 +4567,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
- EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
- EXT4_INODE_SIZE(inode->i_sb));
+ EXT4_INODE_SIZE(inode->i_sb) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode,
+ "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -4534,6 +4647,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
+ if ((size = i_size_read(inode)) < 0) {
+ EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -4577,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
+ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
@@ -5154,7 +5273,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* update c/mtime in shrink case below
*/
if (!shrink) {
- inode->i_mtime = ext4_current_time(inode);
+ inode->i_mtime = current_time(inode);
inode->i_ctime = inode->i_mtime;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5199,12 +5318,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
- if (shrink)
- ext4_truncate(inode);
+ if (shrink) {
+ rc = ext4_truncate(inode);
+ if (rc)
+ error = rc;
+ }
up_write(&EXT4_I(inode)->i_mmap_sem);
}
- if (!rc) {
+ if (!error) {
setattr_copy(inode, attr);
mark_inode_dirty(inode);
}
@@ -5216,7 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (orphan && inode->i_nlink)
ext4_orphan_del(NULL, inode);
- if (!rc && (ia_valid & ATTR_MODE))
+ if (!error && (ia_valid & ATTR_MODE))
rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
@@ -5455,18 +5577,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
return err;
- if (ext4_handle_valid(handle) &&
- EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+ if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
- * We need extra buffer credits since we may write into EA block
+ * In nojournal mode, we can immediately attempt to expand
+ * the inode. When journaled, we first need to obtain extra
+ * buffer credits since we may write into the EA block
* with this same handle. If journal_extend fails, then it will
* only result in a minor loss of functionality for that inode.
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if ((jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+ if (!ext4_handle_valid(handle) ||
+ jbd2_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) {
ret = ext4_expand_extra_isize(inode,
sbi->s_want_extra_isize,
iloc, handle);
@@ -5620,6 +5744,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
+ /*
+ * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated.
+ * E.g. S_DAX may get cleared / set.
+ */
+ ext4_set_inode_flags(inode);
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae8ebbc97..d534399cf607 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,7 +15,7 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/uuid.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = inode_bl->i_ctime = current_time(inode);
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
@@ -191,6 +191,7 @@ journal_err_out:
return err;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16])
return 0;
return 1;
}
+#endif
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
@@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode,
err = -EOPNOTSUPP;
goto flags_out;
}
- } else if (oldflags & EXT4_EOFBLOCKS_FL)
- ext4_truncate(inode);
+ } else if (oldflags & EXT4_EOFBLOCKS_FL) {
+ err = ext4_truncate(inode);
+ if (err)
+ goto flags_out;
+ }
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
@@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode,
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
+ /* These flags get special treatment later */
+ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL)
+ continue;
if (mask & flags)
ext4_set_inode_flag(inode, i);
else
@@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
}
ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
return xflags;
}
+#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
+ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
{
@@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ return -EOPNOTSUPP;
+ /*
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
+ */
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ return -EOPNOTSUPP;
+
err = mnt_want_write_file(filp);
if (err)
return err;
- flags = ext4_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
err = ext4_ioctl_setflags(inode, flags);
inode_unlock(inode);
@@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
@@ -765,28 +787,19 @@ resizefs_out:
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
+ case EXT4_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- if (copy_from_user(&policy,
- (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
- return fscrypt_process_policy(filp, &policy);
-#else
- return -EOPNOTSUPP;
-#endif
- }
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
- if (!ext4_sb_has_crypto(sb))
+ if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
err = mnt_want_write_file(filp);
@@ -816,24 +829,13 @@ resizefs_out:
sbi->s_es->s_encrypt_pw_salt, 16))
return -EFAULT;
return 0;
- }
- case EXT4_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
- int err = 0;
-
- if (!ext4_encrypted_inode(inode))
- return -ENOENT;
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
- if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
#else
return -EOPNOTSUPP;
#endif
}
+ case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -865,13 +867,17 @@ resizefs_out:
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
+ return -EOPNOTSUPP;
+
+ flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ return -EOPNOTSUPP;
+
err = mnt_want_write_file(filp);
if (err)
return err;
- flags = ext4_xflags_to_iflags(fa.fsx_xflags);
- flags = ext4_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f418f55c2bbe..7ae43c59bc79 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t min;
ext4_grpblk_t max;
ext4_grpblk_t chunk;
- unsigned short border;
+ unsigned int border;
BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
@@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
- ext4_grpblk_t counters[16];
+ ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
} sg;
group--;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d89754ef1aab..eb9835638680 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
}
/*
- * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * Write the MMP block using REQ_SYNC to try to get the block on-disk
* faster.
*/
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
ret = -EIO;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 104f8bfba718..eadba919f26b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
@@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_dec_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
end_unlink:
@@ -3254,7 +3254,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_inc_count(handle, inode);
ihold(inode);
@@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
ent->de->file_type = file_type;
ent->dir->i_version++;
ent->dir->i_ctime = ent->dir->i_mtime =
- ext4_current_time(ent->dir);
+ current_time(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = ext4_current_time(old.inode);
+ old.inode->i_ctime = current_time(old.inode);
ext4_mark_inode_dirty(handle, old.inode);
if (!whiteout) {
@@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new.inode) {
ext4_dec_count(handle, new.inode);
- new.inode->i_ctime = ext4_current_time(new.inode);
+ new.inode->i_ctime = current_time(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
@@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
};
u8 new_file_type;
int retval;
+ struct timespec ctime;
if ((ext4_encrypted_inode(old_dir) ||
ext4_encrypted_inode(new_dir)) &&
@@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = ext4_current_time(old.inode);
- new.inode->i_ctime = ext4_current_time(new.inode);
+ ctime = current_time(old.inode);
+ old.inode->i_ctime = ctime;
+ new.inode->i_ctime = ctime;
ext4_mark_inode_dirty(handle, old.inode);
ext4_mark_inode_dirty(handle, new.inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0094923e5ebf..d83b0f3c5fe9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0;
+ REQ_SYNC : 0;
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
}
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
set_buffer_async_write(bh);
nr_to_submit++;
@@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
- data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
+ data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+ page->index, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 52b0530c5d65..66845a08a87a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,7 +38,7 @@
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/cleancache.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
- brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb)
}
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
* Now that we are completely done shutting down the
@@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode)
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
- handle_t *handle;
- int res, res2;
+ handle_t *handle = fs_data;
+ int res, res2, retries = 0;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
- /* fs_data is null when internally used. */
- if (fs_data) {
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
ext4_clear_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - e.g. S_DAX may get disabled
+ */
+ ext4_set_inode_flags(inode);
}
return res;
}
+retry:
handle = ext4_journal_start(inode, EXT4_HT_MISC,
ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle))
return PTR_ERR(handle);
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /* Update inode->i_flags - e.g. S_DAX may get disabled */
+ ext4_set_inode_flags(inode);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
}
res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (!res)
res = res2;
return res;
@@ -1187,7 +1205,7 @@ static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path);
+ const struct path *path);
static int ext4_quota_off(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
@@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
- "in data=ordered mode");
- return 0;
- }
return 1;
}
@@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
struct ext4_super_block *es)
{
unsigned int s_flags = sb->s_flags;
- int nr_orphans = 0, nr_truncates = 0;
+ int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
int i;
#endif
@@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
inode->i_ino, inode->i_size);
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
- ext4_truncate(inode);
+ ret = ext4_truncate(inode);
+ if (ret)
+ ext4_std_error(inode->i_sb, ret);
inode_unlock(inode);
nr_truncates++;
} else {
@@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_set_bit(s++, buf);
count++;
}
- for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
- ext4_set_bit(EXT4_B2C(sbi, s++), buf);
- count++;
+ j = ext4_bg_num_gdb(sb, grp);
+ if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_error(sb, "Invalid number of block group "
+ "descriptor blocks: %d", j);
+ j = EXT4_BLOCKS_PER_GROUP(sb) - s;
}
+ count += j;
+ for (; j > 0; j--)
+ ext4_set_bit(EXT4_B2C(sbi, s++), buf);
}
if (!count)
return 0;
@@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
@@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- goto out_free_orig;
+ if ((data && !orig_data) || !sbi)
+ goto out_free_base;
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock) {
- kfree(sbi);
- goto out_free_orig;
- }
+ if (!sbi->s_blockgroup_lock)
+ goto out_free_base;
+
sb->s_fs_info = sbi;
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
@@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
- &journal_devnum, &journal_ioprio, 0)) {
- ext4_msg(sb, KERN_WARNING,
- "failed to parse options in superblock: %s",
- sbi->s_es->s_mount_opts);
+ if (sbi->s_es->s_mount_opts[0]) {
+ char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+ sizeof(sbi->s_es->s_mount_opts),
+ GFP_KERNEL);
+ if (!s_mount_opts)
+ goto failed_mount;
+ if (!parse_options(s_mount_opts, sb, &journal_devnum,
+ &journal_ioprio, 0)) {
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+ }
+ kfree(s_mount_opts);
}
sbi->s_def_mount_opt = sbi->s_mount_opt;
if (!parse_options((char *) data, sb, &journal_devnum,
@@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and dax");
goto failed_mount;
}
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
} else {
@@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext4;
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
@@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_cluster_ratio = clustersize / blocksize;
- if (sbi->s_inodes_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#inodes per group too big: %lu",
- sbi->s_inodes_per_group);
- goto failed_mount;
- }
-
/* Do we have standard group size of clustersize * 8 blocks ? */
if (sbi->s_blocks_per_group == clustersize << 3)
set_opt2(sb, STD_GROUP_SIZE);
@@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ goto failed_mount;
+ }
+ }
sbi->s_group_desc = ext4_kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
default:
break;
}
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto failed_mount_wq;
+ }
+
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@ -4160,7 +4204,9 @@ no_journal:
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+ "Opts: %.*s%s%s", descr,
+ (int) sizeof(sbi->s_es->s_mount_opts),
+ sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
if (es->s_error_count)
@@ -4239,8 +4285,8 @@ failed_mount:
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
+out_free_base:
kfree(sbi);
-out_free_orig:
kfree(orig_data);
return err ? err : ret;
}
@@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
&EXT4_SB(sb)->s_freeinodes_counter));
BUFFER_TRACE(sbh, "marking dirty");
ext4_superblock_csum_set(sb);
- lock_buffer(sbh);
+ if (sync)
+ lock_buffer(sbh);
if (buffer_write_io_error(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -4566,10 +4613,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
set_buffer_uptodate(sbh);
}
mark_buffer_dirty(sbh);
- unlock_buffer(sbh);
if (sync) {
+ unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
- test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
+ test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
if (error)
return error;
@@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
+ } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ err = -EINVAL;
+ goto restore_opts;
+ }
}
if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
@@ -5239,7 +5293,7 @@ static void lockdep_set_quota_inode(struct inode *inode, int subclass)
* Standard function to be called on quota_on
*/
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path)
+ const struct path *path)
{
int err;
@@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
goto out;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 557b3b0d668c..73b184d161fc 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -83,21 +83,18 @@ errout:
}
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
.listxattr = ext4_listxattr,
};
const struct inode_operations ext4_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = ext4_setattr,
.listxattr = ext4_listxattr,
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = ext4_setattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d77be9e9f535..5a94fa52b74f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
{
struct ext4_xattr_entry *e = entry;
+ /* Find the end of the names list */
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
@@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
e = next;
}
+ /* Check the values */
while (!IS_LAST_ENTRY(entry)) {
if (entry->e_value_block != 0)
return -EFSCORRUPTED;
- if (entry->e_value_size != 0 &&
- (value_start + le16_to_cpu(entry->e_value_offs) <
- (void *)e + sizeof(__u32) ||
- value_start + le16_to_cpu(entry->e_value_offs) +
- le32_to_cpu(entry->e_value_size) > end))
- return -EFSCORRUPTED;
+ if (entry->e_value_size != 0) {
+ u16 offs = le16_to_cpu(entry->e_value_offs);
+ u32 size = le32_to_cpu(entry->e_value_size);
+ void *value;
+
+ /*
+ * The value cannot overlap the names, and the value
+ * with padding cannot extend beyond 'end'. Check both
+ * the padded and unpadded sizes, since the size may
+ * overflow to 0 when adding padding.
+ */
+ if (offs > end - value_start)
+ return -EFSCORRUPTED;
+ value = value_start + offs;
+ if (value < (void *)e + sizeof(u32) ||
+ size > end - value ||
+ EXT4_XATTR_SIZE(size) > end - value)
+ return -EFSCORRUPTED;
+ }
entry = EXT4_XATTR_NEXT(entry);
}
@@ -231,13 +246,12 @@ static int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
- struct ext4_xattr_entry *entry = IFIRST(header);
int error = -EFSCORRUPTED;
- if (((void *) header >= end) ||
+ if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
(header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
goto errout;
- error = ext4_xattr_check_names(entry, end, entry);
+ error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
errout:
if (error)
__ext4_error_inode(inode, function, line, 0,
@@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
return 0;
}
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+static int ext4_xattr_ibody_set(struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
@@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
}
if (!value) {
if (!is.s.not_found)
- error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
@@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
goto cleanup;
- error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
goto cleanup;
if (!is.s.not_found) {
i.value = NULL;
- error = ext4_xattr_ibody_set(handle, inode, &i,
- &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
}
}
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
if (!value)
ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
goto out;
/* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
+ error = ext4_xattr_ibody_set(inode, &i, is);
if (error)
goto out;
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 6fe23af509e1..8f487692c21f 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
if (error)
return error;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7e9b504bd8b2..f73ee9534d83 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_META | REQ_PRIO,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
@@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
+ .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
};
struct blk_plug plug;
@@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
+ ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
}
static int f2fs_write_meta_page(struct page *page,
@@ -770,7 +770,12 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
/* Sanity checking of checkpoint */
if (sanity_check_ckpt(sbi))
- goto fail_no_cp;
+ goto free_fail_no_cp;
+
+ if (cur_page == cp1)
+ sbi->cur_cp_pack = 1;
+ else
+ sbi->cur_cp_pack = 2;
if (cp_blks <= 1)
goto done;
@@ -793,6 +798,9 @@ done:
f2fs_put_page(cp2, 1);
return 0;
+free_fail_no_cp:
+ f2fs_put_page(cp1, 1);
+ f2fs_put_page(cp2, 1);
fail_no_cp:
kfree(sbi->ckpt);
return -EINVAL;
@@ -921,7 +929,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
inode = igrab(&fi->vfs_inode);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
if (inode) {
- update_inode_page(inode);
+ sync_inode_metadata(inode, 0);
+
+ /* it's on eviction */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE))
+ update_inode_page(inode);
iput(inode);
}
};
@@ -987,7 +999,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
- build_free_nids(sbi);
+ build_free_nids(sbi, false);
f2fs_unlock_all(sbi);
}
@@ -998,7 +1010,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&sbi->nr_wb_bios))
+ if (!get_pages(sbi, F2FS_WB_CP_DATA))
break;
io_schedule_timeout(5*HZ);
@@ -1123,7 +1135,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
- start_blk = __start_cp_addr(sbi);
+ start_blk = __start_cp_next_addr(sbi);
/* need to wait for end_io results */
wait_on_all_pages_writeback(sbi);
@@ -1184,9 +1196,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
clear_sbi_flag(sbi, SBI_NEED_CP);
+ __set_cp_next_pack(sbi);
/*
* redirty superblock if metadata like node page or inode cache is
@@ -1261,8 +1273,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
-
- f2fs_wait_all_discard_bio(sbi);
+ if (err) {
+ release_discard_addrs(sbi);
+ } else {
+ clear_prefree_segments(sbi, cpc);
+ f2fs_wait_all_discard_bio(sbi);
+ }
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9ae194fd2fdb..9ac262564fa6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -29,6 +29,26 @@
#include "trace.h"
#include <trace/events/f2fs.h>
+static bool __is_cp_guaranteed(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct f2fs_sb_info *sbi;
+
+ if (!mapping)
+ return false;
+
+ inode = mapping->host;
+ sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi) ||
+ inode->i_ino == F2FS_NODE_INO(sbi) ||
+ S_ISDIR(inode->i_mode) ||
+ is_cold_data(page))
+ return true;
+ return false;
+}
+
static void f2fs_read_end_io(struct bio *bio)
{
struct bio_vec *bvec;
@@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ enum count_type type = WB_DATA_TYPE(page);
fscrypt_pullback_bio_page(&page, true);
@@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio)
mapping_set_error(page->mapping, -EIO);
f2fs_stop_checkpoint(sbi, true);
}
+ dec_page_count(sbi, type);
+ clear_cold_data(page);
end_page_writeback(page);
}
- if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
+ if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
@@ -88,6 +111,46 @@ static void f2fs_write_end_io(struct bio *bio)
}
/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (FDEV(i).start_blk <= blk_addr &&
+ FDEV(i).end_blk >= blk_addr) {
+ blk_addr -= FDEV(i).start_blk;
+ bdev = FDEV(i).bdev;
+ break;
+ }
+ }
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ }
+ return bdev;
+}
+
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
+ return i;
+ return 0;
+}
+
+static bool __same_bdev(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
+}
+
+/*
* Low-level block read/write IO operations.
*/
static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
@@ -97,8 +160,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio = f2fs_bio_alloc(npages);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ f2fs_target_device(sbi, blk_addr, bio);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
bio->bi_private = is_read ? NULL : sbi;
@@ -109,8 +171,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type)
{
if (!is_read_io(bio_op(bio))) {
- atomic_inc(&sbi->nr_wb_bios);
- if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
current->plug && (type == DATA || type == NODE))
blk_finish_plug(current->plug);
}
@@ -198,11 +259,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
io->fio.op = REQ_OP_WRITE;
- if (test_opt(sbi, NOBARRIER))
- io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
- else
- io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
- REQ_PRIO;
+ io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO;
+ if (!test_opt(sbi, NOBARRIER))
+ io->fio.op_flags |= REQ_FUA;
}
__submit_merged_bio(io);
out:
@@ -270,22 +329,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
verify_block_addr(sbi, fio->old_blkaddr);
verify_block_addr(sbi, fio->new_blkaddr);
+ bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+ if (!is_read)
+ inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+
down_write(&io->io_rwsem);
if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
- (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
+ (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
+ !__same_bdev(sbi, fio->new_blkaddr, io->bio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
-
io->bio = __bio_alloc(sbi, fio->new_blkaddr,
- bio_blocks, is_read);
+ BIO_MAX_PAGES, is_read);
io->fio = *fio;
}
- bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
-
if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
PAGE_SIZE) {
__submit_merged_bio(io);
@@ -483,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC, false);
+ page = get_read_data_page(inode, index, 0, false);
if (IS_ERR(page))
return page;
@@ -509,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC, for_write);
+ page = get_read_data_page(inode, index, 0, for_write);
if (IS_ERR(page))
return page;
@@ -590,7 +651,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
- int seg = CURSEG_WARM_DATA;
pgoff_t fofs;
blkcnt_t count = 1;
@@ -608,11 +668,8 @@ alloc:
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
- seg = CURSEG_DIRECT_IO;
-
allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
- &sum, seg);
+ &sum, CURSEG_WARM_DATA);
set_data_blkaddr(dn);
/* update i_size */
@@ -624,11 +681,18 @@ alloc:
return 0;
}
-ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
+static inline bool __force_buffered_io(struct inode *inode, int rw)
+{
+ return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) ||
+ (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
+ F2FS_I_SB(inode)->s_ndevs);
+}
+
+int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct f2fs_map_blocks map;
- ssize_t ret = 0;
+ int err = 0;
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
@@ -640,19 +704,22 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
map.m_next_pgofs = NULL;
if (iocb->ki_flags & IOCB_DIRECT) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ return f2fs_map_blocks(inode, &map, 1,
+ __force_buffered_io(inode, WRITE) ?
+ F2FS_GET_BLOCK_PRE_AIO :
+ F2FS_GET_BLOCK_PRE_DIO);
}
if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
}
if (!f2fs_has_inline_data(inode))
return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- return ret;
+ return err;
}
/*
@@ -676,7 +743,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
struct extent_info ei;
- bool allocated = false;
block_t blkaddr;
if (!maxblocks)
@@ -716,7 +782,7 @@ next_dnode:
}
prealloc = 0;
- ofs_in_node = dn.ofs_in_node;
+ last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
@@ -735,10 +801,8 @@ next_block:
}
} else {
err = __allocate_data_block(&dn);
- if (!err) {
+ if (!err)
set_inode_flag(inode, FI_APPEND_WRITE);
- allocated = true;
- }
}
if (err)
goto sync_out;
@@ -793,7 +857,6 @@ skip:
err = reserve_new_blocks(&dn, prealloc);
if (err)
goto sync_out;
- allocated = dn.node_changed;
map->m_len += dn.ofs_in_node - ofs_in_node;
if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
@@ -812,9 +875,8 @@ skip:
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
- allocated = false;
goto next_dnode;
sync_out:
@@ -822,7 +884,7 @@ sync_out:
unlock_out:
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
out:
trace_f2fs_map_blocks(inode, map, err);
@@ -834,19 +896,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
pgoff_t *next_pgofs)
{
struct f2fs_map_blocks map;
- int ret;
+ int err;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
map.m_next_pgofs = next_pgofs;
- ret = f2fs_map_blocks(inode, &map, create, flag);
- if (!ret) {
+ err = f2fs_map_blocks(inode, &map, create, flag);
+ if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = map.m_len << inode->i_blkbits;
}
- return ret;
+ return err;
}
static int get_data_block(struct inode *inode, sector_t iblock,
@@ -891,7 +953,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct buffer_head map_bh;
sector_t start_blk, last_blk;
pgoff_t next_pgofs;
- loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
@@ -908,13 +969,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
inode_lock(inode);
- isize = i_size_read(inode);
- if (start >= isize)
- goto out;
-
- if (start + len > isize)
- len = isize - start;
-
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
@@ -933,13 +987,11 @@ next:
/* HOLE */
if (!buffer_mapped(&map_bh)) {
start_blk = next_pgofs;
- /* Go through holes util pass the EOF */
- if (blk_to_logical(inode, start_blk) < isize)
+
+ if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
+ F2FS_I_SB(inode)->max_file_blocks))
goto prep_next;
- /* Found a hole beyond isize means no more extents.
- * Note that the premise is that filesystems don't
- * punch holes beyond isize and keep size unchanged.
- */
+
flags |= FIEMAP_EXTENT_LAST;
}
@@ -982,7 +1034,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct fscrypt_ctx *ctx = NULL;
- struct block_device *bdev = sbi->sb->s_bdev;
struct bio *bio;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
@@ -1000,8 +1051,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
fscrypt_release_ctx(ctx);
return ERR_PTR(-ENOMEM);
}
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
+ f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = ctx;
@@ -1096,7 +1146,8 @@ got_it:
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != block_nr - 1)) {
+ if (bio && (last_block_in_bio != block_nr - 1 ||
+ !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
submit_and_realloc:
__submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
@@ -1195,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
fio->old_blkaddr);
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
- gfp_flags);
+ PAGE_SIZE, 0,
+ fio->page->index,
+ gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
if (err == -ENOMEM) {
@@ -1251,7 +1304,7 @@ static int f2fs_write_data_page(struct page *page,
.sbi = sbi,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
@@ -1311,7 +1364,6 @@ done:
if (err && err != -ENOENT)
goto redirty_out;
- clear_cold_data(page);
out:
inode_dec_dirty_pages(inode);
if (err)
@@ -1332,6 +1384,8 @@ out:
redirty_out:
redirty_page_for_writepage(wbc, page);
+ if (!err)
+ return AOP_WRITEPAGE_ACTIVATE;
unlock_page(page);
return err;
}
@@ -1427,6 +1481,15 @@ continue_unlock:
ret = mapping->a_ops->writepage(page, wbc);
if (unlikely(ret)) {
+ /*
+ * keep nr_to_write, since vfs uses this to
+ * get # of written pages.
+ */
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ continue;
+ }
done_index = page->index + 1;
done = 1;
break;
@@ -1663,7 +1726,7 @@ repeat:
err = PTR_ERR(bio);
goto fail;
}
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ bio->bi_opf = REQ_OP_READ;
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
err = -EFAULT;
@@ -1714,7 +1777,6 @@ static int f2fs_write_end(struct file *file,
goto unlock_out;
set_page_dirty(page);
- clear_cold_data(page);
if (pos + copied > i_size_read(inode))
f2fs_i_size_write(inode, pos + copied);
@@ -1751,9 +1813,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (err)
return err;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return 0;
- if (test_opt(F2FS_I_SB(inode), LFS))
+ if (__force_buffered_io(inode, rw))
return 0;
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
@@ -1785,12 +1845,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
return;
if (PageDirty(page)) {
- if (inode->i_ino == F2FS_META_INO(sbi))
+ if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
- else if (inode->i_ino == F2FS_NODE_INO(sbi))
+ } else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_NODES);
- else
+ } else {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
}
/* This is atomic written page, keep Private */
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fb245bd302e4..fbd5184140d0 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->wb_bios = atomic_read(&sbi->nr_wb_bios);
+ si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
+ si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -74,7 +75,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
- si->fnids = NM_I(sbi)->fcnt;
+ si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+ si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
@@ -194,7 +196,9 @@ get_cache:
si->cache_mem += sizeof(struct flush_cmd_control);
/* free nids */
- si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
+ NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) *
+ sizeof(struct free_nid);
si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
@@ -310,22 +314,22 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n",
- si->inmem_pages, si->wb_bios);
- seq_printf(s, " - nodes: %4lld in %4d\n",
+ seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n",
+ si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data);
+ seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n",
+ seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
- seq_printf(s, " - datas: %4lld in files:%4d\n",
+ seq_printf(s, " - datas: %4d in files:%4d\n",
si->ndirty_data, si->ndirty_files);
- seq_printf(s, " - meta: %4lld in %4d\n",
+ seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - imeta: %4lld\n",
+ seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
- seq_printf(s, " - free_nids: %9d\n",
- si->fnids);
+ seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n",
+ si->free_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -373,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file)
}
static const struct file_operations stat_fops = {
+ .owner = THIS_MODULE,
.open = stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 369f4513be37..827c5daef4fc 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
/* show encrypted name */
if (fname->hash) {
- if (de->hash_code == fname->hash)
+ if (de->hash_code == cpu_to_le32(fname->hash))
goto found;
} else if (de_name.len == name->len &&
de->hash_code == namehash &&
@@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(inode, FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
f2fs_i_depth_write(dir, current_depth);
@@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
@@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
ClearPagePrivate(page);
ClearPageUptodate(page);
inode_dec_dirty_pages(dir);
+ remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
}
@@ -784,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
-bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
unsigned int start_pos, struct fscrypt_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
@@ -819,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
(u32)de->hash_code, 0,
&de_name, fstr);
if (err)
- return true;
+ return err;
de_name = *fstr;
fstr->len = save_len;
@@ -827,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (!dir_emit(ctx, de_name.name, de_name.len,
le32_to_cpu(de->ino), d_type))
- return true;
+ return 1;
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
- return false;
+ return 0;
}
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
@@ -871,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
dentry_page = get_lock_data_page(inode, n, false);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ err = 0;
continue;
- else
+ } else {
goto out;
+ }
}
dentry_blk = kmap(dentry_page);
make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+ err = f2fs_fill_dentries(ctx, &d,
+ n * NR_DENTRY_IN_BLOCK, &fstr);
+ if (err) {
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
break;
@@ -891,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
- err = 0;
out:
fscrypt_fname_free_buffer(&fstr);
- return err;
+ return err < 0 ? err : 0;
}
static int f2fs_dir_open(struct inode *inode, struct file *filp)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 2b06d4fcd954..4db44da7ef69 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode,
if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
largest->len = 0;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9e8de18a168a..2da8c3aa0ce5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -103,7 +103,7 @@ struct f2fs_mount_info {
};
#define F2FS_FEATURE_ENCRYPT 0x0001
-#define F2FS_FEATURE_HMSMR 0x0002
+#define F2FS_FEATURE_BLKZONED 0x0002
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -401,6 +401,7 @@ struct f2fs_map_blocks {
#define FADVISE_LOST_PINO_BIT 0x02
#define FADVISE_ENCRYPT_BIT 0x04
#define FADVISE_ENC_NAME_BIT 0x08
+#define FADVISE_KEEP_SIZE_BIT 0x10
#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
@@ -413,6 +414,8 @@ struct f2fs_map_blocks {
#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
#define DEF_DIR_LEVEL 0
@@ -428,7 +431,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- struct percpu_counter dirty_pages; /* # of dirty pages */
+ atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
@@ -493,20 +496,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
-extern void f2fs_mark_inode_dirty_sync(struct inode *);
+extern void f2fs_mark_inode_dirty_sync(struct inode *, bool);
static inline void __try_update_largest_extent(struct inode *inode,
struct extent_tree *et, struct extent_node *en)
{
if (en->ei.len > et->largest.len) {
et->largest = en->ei;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
+enum nid_list {
+ FREE_NID_LIST,
+ ALLOC_NID_LIST,
+ MAX_NID_LIST,
+};
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
- nid_t available_nids; /* maximum available node ids */
+ nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
@@ -522,9 +531,9 @@ struct f2fs_nm_info {
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
- struct list_head free_nid_list; /* a list for free nids */
- spinlock_t free_nid_list_lock; /* protect free nid list */
- unsigned int fcnt; /* the number of free node id */
+ struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
+ unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */
+ spinlock_t nid_list_lock; /* protect nid lists ops */
struct mutex build_lock; /* lock for build free nids */
/* for checkpoint */
@@ -585,7 +594,6 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
- CURSEG_DIRECT_IO, /* to use for the direct IO path */
};
struct flush_cmd {
@@ -649,6 +657,7 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
+#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -656,6 +665,8 @@ enum count_type {
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
+ F2FS_WB_CP_DATA,
+ F2FS_WB_DATA,
NR_COUNT_TYPE,
};
@@ -688,7 +699,7 @@ struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int op; /* contains REQ_OP_ */
- int op_flags; /* rq_flag_bits */
+ int op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
@@ -704,6 +715,20 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+#define FDEV(i) (sbi->devs[i])
+#define RDEV(i) (raw_super->devs[i])
+struct f2fs_dev_info {
+ struct block_device *bdev;
+ char path[MAX_PATH_LEN];
+ unsigned int total_segments;
+ block_t start_blk;
+ block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_blkz; /* Total number of zones */
+ u8 *blkz_type; /* Array of zones type */
+#endif
+};
+
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
@@ -750,6 +775,12 @@ struct f2fs_sb_info {
u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
u8 key_prefix_size;
#endif
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */
+#endif
+
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -764,6 +795,7 @@ struct f2fs_sb_info {
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
@@ -815,10 +847,9 @@ struct f2fs_sb_info {
block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_wb_bios; /* # of writeback bios */
/* # of pages, see count_type */
- struct percpu_counter nr_pages[NR_COUNT_TYPE];
+ atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
@@ -863,6 +894,8 @@ struct f2fs_sb_info {
/* For shrinker support */
struct list_head s_list;
+ int s_ndevs; /* number of devices */
+ struct f2fs_dev_info *devs; /* for device list */
struct mutex umount_mutex;
unsigned int shrinker_run_no;
@@ -1105,13 +1138,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock(&sbi->cp_lock);
}
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
-{
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
-
- return blk_queue_discard(q);
-}
-
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -1232,9 +1258,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- percpu_counter_inc(&sbi->nr_pages[count_type]);
+ atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
+ if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
return;
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -1242,14 +1269,14 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_pages(struct inode *inode)
{
- percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
+ atomic_inc(&F2FS_I(inode)->dirty_pages);
inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- percpu_counter_dec(&sbi->nr_pages[count_type]);
+ atomic_dec(&sbi->nr_pages[count_type]);
}
static inline void inode_dec_dirty_pages(struct inode *inode)
@@ -1258,19 +1285,19 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
!S_ISLNK(inode->i_mode))
return;
- percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
+ atomic_dec(&F2FS_I(inode)->dirty_pages);
dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
- return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
+ return atomic_read(&sbi->nr_pages[count_type]);
}
-static inline s64 get_dirty_pages(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
{
- return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
+ return atomic_read(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -1329,22 +1356,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
{
- block_t start_addr;
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- unsigned long long ckpt_version = cur_cp_version(ckpt);
-
- start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
- /*
- * odd numbered checkpoint should at cp segment 0
- * and even segment must be at cp segment 1
- */
- if (!(ckpt_version & 1))
+ if (sbi->cur_cp_pack == 2)
start_addr += sbi->blocks_per_seg;
+ return start_addr;
+}
+
+static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
+{
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ if (sbi->cur_cp_pack == 1)
+ start_addr += sbi->blocks_per_seg;
return start_addr;
}
+static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi)
+{
+ sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1;
+}
+
static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
{
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
@@ -1621,7 +1653,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
return;
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -1648,7 +1680,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
F2FS_I(inode)->i_acl_mode = mode;
set_inode_flag(inode, FI_ACL_MODE);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static inline void f2fs_i_links_write(struct inode *inode, bool inc)
@@ -1657,7 +1689,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc)
inc_nlink(inode);
else
drop_nlink(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_blocks_write(struct inode *inode,
@@ -1668,7 +1700,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode,
inode->i_blocks = add ? inode->i_blocks + diff :
inode->i_blocks - diff;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
@@ -1682,34 +1714,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
-static inline bool f2fs_skip_inode_update(struct inode *inode)
-{
- if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
- return false;
- return F2FS_I(inode)->last_disk_size == i_size_read(inode);
-}
-
static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
{
F2FS_I(inode)->i_current_depth = depth;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
{
F2FS_I(inode)->i_xattr_nid = xnid;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
{
F2FS_I(inode)->i_pino = pino;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
@@ -1837,13 +1862,31 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise |= type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void clear_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise &= ~type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
+{
+ if (dsync) {
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ bool ret;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ ret = list_empty(&F2FS_I(inode)->gdirty_list);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return ret;
+ }
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
+ file_keep_isize(inode) ||
+ i_size_read(inode) & PAGE_MASK)
+ return false;
+ return F2FS_I(inode)->last_disk_size == i_size_read(inode);
}
static inline int f2fs_readonly(struct super_block *sb)
@@ -1955,7 +1998,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t);
unsigned char get_de_type(struct f2fs_dir_entry *);
struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
-bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
+int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
unsigned int, struct fscrypt_str *);
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
@@ -1995,7 +2038,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
/*
* super.c
*/
-int f2fs_inode_dirtied(struct inode *);
+int f2fs_inode_dirtied(struct inode *, bool);
void f2fs_inode_synced(struct inode *);
int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
@@ -2034,7 +2077,7 @@ void move_node_page(struct page *, int);
int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
struct writeback_control *, bool);
int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
-void build_free_nids(struct f2fs_sb_info *);
+void build_free_nids(struct f2fs_sb_info *, bool);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@ -2060,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
-void destroy_flush_cmd_control(struct f2fs_sb_info *);
+void destroy_flush_cmd_control(struct f2fs_sb_info *, bool);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
@@ -2132,12 +2175,15 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
void f2fs_flush_merged_bios(struct f2fs_sb_info *);
int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
+struct block_device *f2fs_target_device(struct f2fs_sb_info *,
+ block_t, struct bio *);
+int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
void set_data_blkaddr(struct dnode_of_data *);
void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
-ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
+int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
@@ -2160,7 +2206,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *,
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct inode *);
-int f2fs_gc(struct f2fs_sb_info *, bool);
+int f2fs_gc(struct f2fs_sb_info *, bool, bool);
void build_gc_manager(struct f2fs_sb_info *);
/*
@@ -2181,12 +2227,12 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
- s64 inmem_pages;
+ int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+ int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, ndirty_all;
- int nats, dirty_nats, sits, dirty_sits, fnids;
+ int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, wb_bios;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
int inline_xattr, inline_inode, inline_dir, orphans;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
@@ -2412,9 +2458,30 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
}
-static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
+static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline int get_blkz_type(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkaddr)
+{
+ unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).bdev == bdev)
+ return FDEV(i).blkz_type[zno];
+ return -EINVAL;
+}
+#endif
+
+static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+ return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -2453,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c7865073cd26..49f10dce817d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -94,8 +94,6 @@ mapped:
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
- /* if gced page is attached, don't write to cold segment */
- clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
@@ -210,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync && !f2fs_skip_inode_update(inode)) {
+ if (!f2fs_skip_inode_update(inode, datasync)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -264,7 +262,7 @@ sync_nodes:
}
if (need_inode_block_update(sbi, ino)) {
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
@@ -632,7 +630,7 @@ int f2fs_truncate(struct inode *inode)
return err;
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -679,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
+ bool size_changed = false;
err = setattr_prepare(dentry, attr);
if (err)
@@ -694,7 +693,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
err = f2fs_truncate(inode);
if (err)
return err;
- f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
@@ -710,6 +708,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
inode->i_mtime = inode->i_ctime = current_time(inode);
}
+
+ size_changed = true;
}
__setattr_copy(inode, attr);
@@ -722,7 +722,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- f2fs_mark_inode_dirty_sync(inode);
+ /* file size may changed here */
+ f2fs_mark_inode_dirty_sync(inode, size_changed);
+
+ /* inode change will produce dirty node pages flushed by checkpoint */
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
return err;
}
@@ -967,7 +972,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
new_size = (dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
- } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
+ } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
f2fs_put_dnode(&dn);
} else {
@@ -1218,6 +1223,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
if (ret)
goto out;
@@ -1313,15 +1321,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
- int ret;
+ int err;
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret)
- return ret;
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ return err;
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
f2fs_balance_fs(sbi, true);
@@ -1333,12 +1341,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (off_end)
map.m_len++;
- ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- if (ret) {
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (err) {
pgoff_t last_off;
if (!map.m_len)
- return ret;
+ return err;
last_off = map.m_lblk + map.m_len - 1;
@@ -1352,7 +1360,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
f2fs_i_size_write(inode, new_size);
- return ret;
+ return err;
}
static long f2fs_fallocate(struct file *file, int mode,
@@ -1393,7 +1401,9 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1526,7 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
- "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+ "Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
@@ -1752,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
-
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return fscrypt_process_policy(filp, &policy);
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
- struct inode *inode = file_inode(filp);
- int err;
-
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
-
- if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
}
static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1842,7 +1837,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
mutex_lock(&sbi->gc_mutex);
}
- ret = f2fs_gc(sbi, sync);
+ ret = f2fs_gc(sbi, sync, true);
out:
mnt_drop_write_file(filp);
return ret;
@@ -2256,12 +2251,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0) {
- ret = f2fs_preallocate_blocks(iocb, from);
- if (!ret) {
- blk_start_plug(&plug);
- ret = __generic_file_write_iter(iocb, from);
- blk_finish_plug(&plug);
+ int err = f2fs_preallocate_blocks(iocb, from);
+
+ if (err) {
+ inode_unlock(inode);
+ return err;
}
+ blk_start_plug(&plug);
+ ret = __generic_file_write_iter(iocb, from);
+ blk_finish_plug(&plug);
}
inode_unlock(inode);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6f14ee923acd..88bfc3dff496 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -82,7 +82,7 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -544,13 +544,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return true;
}
-static void move_encrypted_block(struct inode *inode, block_t bidx)
+static void move_encrypted_block(struct inode *inode, block_t bidx,
+ unsigned int segno, int off)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC,
+ .op_flags = 0,
.encrypted_page = NULL,
};
struct dnode_of_data dn;
@@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
if (!page)
return;
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
if (err)
@@ -625,7 +629,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
fio.op = REQ_OP_WRITE;
- fio.op_flags = WRITE_SYNC;
+ fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_mbio(&fio);
@@ -645,7 +649,8 @@ out:
f2fs_put_page(page, 1);
}
-static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
+static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
+ unsigned int segno, int off)
{
struct page *page;
@@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
if (IS_ERR(page))
return;
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
if (gc_type == BG_GC) {
if (PageWriteback(page))
goto out;
@@ -663,7 +671,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
.sbi = F2FS_I_SB(inode),
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC,
+ .op_flags = REQ_SYNC,
.page = page,
.encrypted_page = NULL,
};
@@ -673,8 +681,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
retry:
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
set_cold_data(page);
@@ -683,8 +693,6 @@ retry:
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
-
- clear_cold_data(page);
}
out:
f2fs_put_page(page, 1);
@@ -794,9 +802,9 @@ next_step:
start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- move_encrypted_block(inode, start_bidx);
+ move_encrypted_block(inode, start_bidx, segno, off);
else
- move_data_page(inode, start_bidx, gc_type);
+ move_data_page(inode, start_bidx, gc_type, segno, off);
if (locked) {
up_write(&fi->dio_rwsem[WRITE]);
@@ -899,7 +907,7 @@ next:
return sec_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background)
{
unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
@@ -940,6 +948,9 @@ gc_more:
if (ret)
goto stop;
}
+ } else if (gc_type == BG_GC && !background) {
+ /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
+ goto stop;
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5f1a67f756af..e32a9e527968 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
.sbi = F2FS_I_SB(dn->inode),
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_PRIO,
.page = page,
.encrypted_page = NULL,
};
@@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
fio.old_blkaddr = dn->data_blkaddr;
write_data_page(dn, &fio);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (dirty)
+ if (dirty) {
inode_dec_dirty_pages(dn->inode);
+ remove_dirty_inode(dn->inode);
+ }
/* this converted inline_data should be recovered. */
set_inode_flag(dn->inode, FI_APPEND_WRITE);
@@ -419,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
}
new_name.name = d.filename[bit_pos];
- new_name.len = de->name_len;
+ new_name.len = le16_to_cpu(de->name_len);
ino = le32_to_cpu(de->ino);
fake_mode = get_de_type(de) << S_SHIFT;
@@ -573,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_put_page(page, 1);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
@@ -610,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct f2fs_inline_dentry *inline_dentry = NULL;
struct page *ipage = NULL;
struct f2fs_dentry_ptr d;
+ int err;
if (ctx->pos == NR_INLINE_DENTRY)
return 0;
@@ -622,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
- if (!f2fs_fill_dentries(ctx, &d, 0, fstr))
+ err = f2fs_fill_dentries(ctx, &d, 0, fstr);
+ if (!err)
ctx->pos = NR_INLINE_DENTRY;
f2fs_put_page(ipage, 1);
- return 0;
+ return err < 0 ? err : 0;
}
int f2fs_inline_data_fiemap(struct inode *inode,
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d7369895a78a..af06bda51a54 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -19,10 +19,11 @@
#include <trace/events/f2fs.h>
-void f2fs_mark_inode_dirty_sync(struct inode *inode)
+void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
- if (f2fs_inode_dirtied(inode))
+ if (f2fs_inode_dirtied(inode, sync))
return;
+
mark_inode_dirty_sync(inode);
}
@@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -252,6 +253,7 @@ retry:
int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
f2fs_inode_synced(inode);
@@ -267,11 +269,13 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
- if (F2FS_I(inode)->extent_tree)
- set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
- &ri->i_ext);
- else
+ if (et) {
+ read_lock(&et->lock);
+ set_raw_extent(&et->largest, &ri->i_ext);
+ read_unlock(&et->lock);
+ } else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
+ }
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -335,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- if (update_inode_page(inode))
+ if (update_inode_page(inode) && wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -373,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode)
goto no_delete;
#endif
+ remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+
sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
@@ -384,6 +391,8 @@ retry:
f2fs_lock_op(sbi);
err = remove_inode_page(inode);
f2fs_unlock_op(sbi);
+ if (err == -ENOENT)
+ err = 0;
}
/* give more chances, if ENOMEM case */
@@ -403,10 +412,12 @@ no_delete:
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(inode, FI_APPEND_WRITE))
- add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
- add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ if (inode->i_nlink) {
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ }
if (is_inode_flag_set(inode, FI_FREE_NID)) {
alloc_nid_failed(sbi, inode->i_ino);
clear_inode_flag(inode, FI_FREE_NID);
@@ -424,6 +435,18 @@ void handle_failed_inode(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct node_info ni;
+ /*
+ * clear nlink of inode in order to release resource of inode
+ * immediately.
+ */
+ clear_nlink(inode);
+
+ /*
+ * we must call this to avoid inode being remained as dirty, resulting
+ * in a panic when flushing dirty inodes in gdirty_list.
+ */
+ update_inode_page(inode);
+
/* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 489fa0d5f914..56c19b0610a8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
- f2fs_mark_inode_dirty_sync(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir, false);
f2fs_unlock_op(sbi);
@@ -1075,7 +1075,6 @@ errout:
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
@@ -1105,7 +1104,6 @@ const struct inode_operations f2fs_dir_inode_operations = {
};
const struct inode_operations f2fs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 01177ecdeab8..b9078fdb3743 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
* give 25%, 25%, 50%, 50%, 50% memory for each components respectively
*/
if (type == FREE_NIDS) {
- mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
- PAGE_SHIFT;
+ mem_size = (nm_i->nid_cnt[FREE_NID_LIST] *
+ sizeof(struct free_nid)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
@@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
} else {
- f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
- nat_get_blkaddr(e) != ne->block_addr ||
+ f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
+ nat_get_blkaddr(e) !=
+ le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
}
}
@@ -1134,7 +1135,7 @@ repeat:
if (!page)
return ERR_PTR(-ENOMEM);
- err = read_node_page(page, READ_SYNC);
+ err = read_node_page(page, 0);
if (err < 0) {
f2fs_put_page(page, 1);
return ERR_PTR(err);
@@ -1204,6 +1205,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
ret = f2fs_write_inline_data(inode, page);
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
if (ret)
set_page_dirty(page);
page_out:
@@ -1338,7 +1340,8 @@ retry:
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_put_page(last_page, 0);
pagevec_release(&pvec);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
if (!IS_DNODE(page) || !is_cold_node(page))
@@ -1407,11 +1410,12 @@ continue_unlock:
"Retry to write fsync mark: ino=%u, idx=%lx",
ino, last_page->index);
lock_page(last_page);
+ f2fs_wait_on_page_writeback(last_page, NODE, true);
set_page_dirty(last_page);
unlock_page(last_page);
goto retry;
}
-
+out:
if (nwritten)
f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
return ret ? -EIO: 0;
@@ -1570,7 +1574,7 @@ static int f2fs_write_node_page(struct page *page,
.sbi = sbi,
.type = NODE,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
@@ -1692,11 +1696,35 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
- struct free_nid *i)
+static int __insert_nid_to_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool new)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ if (new) {
+ int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+ if (err)
+ return err;
+ }
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]++;
+ list_add_tail(&i->list, &nm_i->nid_list[list]);
+ return 0;
+}
+
+static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool reuse)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]--;
list_del(&i->list);
- radix_tree_delete(&nm_i->free_nid_root, i->nid);
+ if (!reuse)
+ radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
@@ -1704,9 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
struct nat_entry *ne;
-
- if (!available_free_memory(sbi, FREE_NIDS))
- return -1;
+ int err;
/* 0 nid should not be used */
if (unlikely(nid == 0))
@@ -1729,33 +1755,30 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
return 0;
}
- spin_lock(&nm_i->free_nid_list_lock);
- if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
- spin_unlock(&nm_i->free_nid_list_lock);
- radix_tree_preload_end();
+ spin_lock(&nm_i->nid_list_lock);
+ err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+ spin_unlock(&nm_i->nid_list_lock);
+ radix_tree_preload_end();
+ if (err) {
kmem_cache_free(free_nid_slab, i);
return 0;
}
- list_add_tail(&i->list, &nm_i->free_nid_list);
- nm_i->fcnt++;
- spin_unlock(&nm_i->free_nid_list_lock);
- radix_tree_preload_end();
return 1;
}
-static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
bool need_free = false;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
need_free = true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
@@ -1778,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
- if (blk_addr == NULL_ADDR) {
- if (add_free_nid(sbi, start_nid, true) < 0)
- break;
- }
+ if (blk_addr == NULL_ADDR)
+ add_free_nid(sbi, start_nid, true);
}
}
-void build_free_nids(struct f2fs_sb_info *sbi)
+static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1794,7 +1815,10 @@ void build_free_nids(struct f2fs_sb_info *sbi)
nid_t nid = nm_i->next_scan_nid;
/* Enough entries */
- if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
+ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
+ return;
+
+ if (!sync && !available_free_memory(sbi, FREE_NIDS))
return;
/* readahead nat pages to be scanned */
@@ -1830,7 +1854,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
if (addr == NULL_ADDR)
add_free_nid(sbi, nid, true);
else
- remove_free_nid(nm_i, nid);
+ remove_free_nid(sbi, nid);
}
up_read(&curseg->journal_rwsem);
up_read(&nm_i->nat_tree_lock);
@@ -1839,6 +1863,13 @@ void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->ra_nid_pages, META_NAT, false);
}
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync)
+{
+ mutex_lock(&NM_I(sbi)->build_lock);
+ __build_free_nids(sbi, sync);
+ mutex_unlock(&NM_I(sbi)->build_lock);
+}
+
/*
* If this function returns success, caller can obtain a new nid
* from second parameter of this function.
@@ -1853,31 +1884,31 @@ retry:
if (time_to_inject(sbi, FAULT_ALLOC_NID))
return false;
#endif
- if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
- return false;
+ spin_lock(&nm_i->nid_list_lock);
- spin_lock(&nm_i->free_nid_list_lock);
+ if (unlikely(nm_i->available_nids == 0)) {
+ spin_unlock(&nm_i->nid_list_lock);
+ return false;
+ }
/* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
- list_for_each_entry(i, &nm_i->free_nid_list, list)
- if (i->state == NID_NEW)
- break;
-
- f2fs_bug_on(sbi, i->state != NID_NEW);
+ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) {
+ f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST]));
+ i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+ struct free_nid, list);
*nid = i->nid;
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, true);
i->state = NID_ALLOC;
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
return true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
/* Let's scan nat pages and its caches to get free nids */
- mutex_lock(&nm_i->build_lock);
- build_free_nids(sbi);
- mutex_unlock(&nm_i->build_lock);
+ build_free_nids(sbi, true);
goto retry;
}
@@ -1889,11 +1920,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, !i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
}
@@ -1910,17 +1941,22 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
if (!nid)
return;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i);
+
if (!available_free_memory(sbi, FREE_NIDS)) {
- __del_from_free_nid_list(nm_i, i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
need_free = true;
} else {
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true);
i->state = NID_NEW;
- nm_i->fcnt++;
+ __insert_nid_to_list(sbi, i, FREE_NID_LIST, false);
}
- spin_unlock(&nm_i->free_nid_list_lock);
+
+ nm_i->available_nids++;
+
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
@@ -1932,24 +1968,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
struct free_nid *i, *next;
int nr = nr_shrink;
- if (nm_i->fcnt <= MAX_FREE_NIDS)
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
return 0;
if (!mutex_trylock(&nm_i->build_lock))
return 0;
- spin_lock(&nm_i->free_nid_list_lock);
- list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ if (nr_shrink <= 0 ||
+ nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
break;
- if (i->state == NID_ALLOC)
- continue;
- __del_from_free_nid_list(nm_i, i);
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
kmem_cache_free(free_nid_slab, i);
- nm_i->fcnt--;
nr_shrink--;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
mutex_unlock(&nm_i->build_lock);
return nr - nr_shrink;
@@ -2005,7 +2041,7 @@ recover_xnid:
if (unlikely(!inc_valid_node_count(sbi, inode)))
f2fs_bug_on(sbi, 1);
- remove_free_nid(NM_I(sbi), new_xnid);
+ remove_free_nid(sbi, new_xnid);
get_node_info(sbi, new_xnid, &ni);
ni.ino = inode->i_ino;
set_node_addr(sbi, &ni, NEW_ADDR, false);
@@ -2035,7 +2071,7 @@ retry:
}
/* Should not use this inode from free nid list */
- remove_free_nid(NM_I(sbi), ino);
+ remove_free_nid(sbi, ino);
if (!PageUptodate(ipage))
SetPageUptodate(ipage);
@@ -2069,7 +2105,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
int i, idx, last_offset, nrpages;
/* scan the node segment */
@@ -2078,7 +2113,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, bio_blocks);
+ nrpages = min(last_offset - i, BIO_MAX_PAGES);
/* readahead node pages */
ra_meta_pages(sbi, addr, nrpages, META_POR, true);
@@ -2120,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
ne = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&ne->ni, &raw_ne);
}
+
+ /*
+ * if a free nat in journal has not been used after last
+ * checkpoint, we should remove it from available nids,
+ * since later we will add it again.
+ */
+ if (!get_nat_flag(ne, IS_DIRTY) &&
+ le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
+ spin_lock(&nm_i->nid_list_lock);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
+ }
+
__set_nat_cache_dirty(nm_i, ne);
}
update_nats_in_cursum(journal, -i);
@@ -2192,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
raw_nat_from_node_info(raw_ne, &ne->ni);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- if (nat_get_blkaddr(ne) == NULL_ADDR)
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
add_free_nid(sbi, nid, false);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ NM_I(sbi)->available_nids++;
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
}
if (to_journal)
@@ -2268,21 +2320,24 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
- nm_i->fcnt = 0;
+ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
+ F2FS_RESERVED_NODE_NUM;
+ nm_i->nid_cnt[FREE_NID_LIST] = 0;
+ nm_i->nid_cnt[ALLOC_NID_LIST] = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
- INIT_LIST_HEAD(&nm_i->free_nid_list);
+ INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]);
+ INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);
mutex_init(&nm_i->build_lock);
- spin_lock_init(&nm_i->free_nid_list_lock);
+ spin_lock_init(&nm_i->nid_list_lock);
init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
@@ -2310,7 +2365,7 @@ int build_node_manager(struct f2fs_sb_info *sbi)
if (err)
return err;
- build_free_nids(sbi);
+ build_free_nids(sbi, true);
return 0;
}
@@ -2327,17 +2382,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
return;
/* destroy free nid list */
- spin_lock(&nm_i->free_nid_list_lock);
- list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(sbi, i->state == NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
}
- f2fs_bug_on(sbi, nm_i->fcnt);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]);
+ f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST]));
+ spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
down_write(&nm_i->nat_tree_lock);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 868bec65e51c..e7997e240366 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- spin_lock(&nm_i->free_nid_list_lock);
- if (nm_i->fcnt <= 0) {
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) {
+ spin_unlock(&nm_i->nid_list_lock);
return;
}
- fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+ fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next,
+ struct free_nid, list);
*nid = fnid->nid;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
}
/*
@@ -313,7 +314,7 @@ static inline bool is_recoverable_dnode(struct page *page)
((unsigned char *)ckpt + crc_offset)));
cp_ver |= (crc << 32);
}
- return cpu_to_le64(cp_ver) == cpver_of_node(page);
+ return cp_ver == cpver_of_node(page);
}
/*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 2fc84a991325..981a9584b62f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -180,13 +180,15 @@ static void recover_inode(struct inode *inode, struct page *page)
inode->i_mode = le16_to_cpu(raw->i_mode);
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ F2FS_I(inode)->i_advise = raw->i_advise;
+
if (file_enc_name(inode))
name = "<encrypted>";
else
@@ -196,32 +198,6 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
-static bool is_same_inode(struct inode *inode, struct page *ipage)
-{
- struct f2fs_inode *ri = F2FS_INODE(ipage);
- struct timespec disk;
-
- if (!IS_INODE(ipage))
- return true;
-
- disk.tv_sec = le64_to_cpu(ri->i_ctime);
- disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
- if (timespec_compare(&inode->i_ctime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_atime);
- disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- if (timespec_compare(&inode->i_atime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_mtime);
- disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
- if (timespec_compare(&inode->i_mtime, &disk) > 0)
- return false;
-
- return true;
-}
-
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
struct curseg_info *curseg;
@@ -248,10 +224,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (entry) {
- if (!is_same_inode(entry->inode, page))
- goto next;
- } else {
+ if (!entry) {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -454,7 +427,8 @@ retry_dn:
continue;
}
- if ((start + 1) << PAGE_SHIFT > i_size_read(inode))
+ if (!file_keep_isize(inode) &&
+ (i_size_read(inode) <= (start << PAGE_SHIFT)))
f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT);
/*
@@ -507,8 +481,10 @@ err:
f2fs_put_dnode(&dn);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
- "recover_data: ino = %lx, recovered = %d blocks, err = %d",
- inode->i_ino, recovered, err);
+ "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
+ inode->i_ino,
+ file_keep_isize(inode) ? "keep" : "recover",
+ recovered, err);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fc886f008449..0d8802453758 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode,
.sbi = sbi,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
bool submit_bio = false;
@@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode,
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
fio.page = page;
err = do_write_data_page(&fio);
@@ -287,7 +289,6 @@ static int __commit_inmem_pages(struct inode *inode,
/* record old blkaddr for revoking */
cur->old_addr = fio.old_blkaddr;
- clear_cold_data(page);
submit_bio = true;
}
unlock_page(page);
@@ -363,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, false);
+ f2fs_gc(sbi, false, false);
}
}
@@ -380,14 +381,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
if (!available_free_memory(sbi, FREE_NIDS))
try_to_free_nids(sbi, MAX_FREE_NIDS);
else
- build_free_nids(sbi);
+ build_free_nids(sbi, false);
+
+ if (!is_idle(sbi))
+ return;
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
!available_free_memory(sbi, INO_ENTRIES) ||
excess_prefree_segs(sbi) ||
excess_dirty_nats(sbi) ||
- (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ f2fs_time_over(sbi, CP_TIME)) {
if (test_opt(sbi, DATA_FLUSH)) {
struct blk_plug plug;
@@ -400,6 +404,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
}
}
+static int __submit_flush_wait(struct block_device *bdev)
+{
+ struct bio *bio = f2fs_bio_alloc(0);
+ int ret;
+
+ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio->bi_bdev = bdev;
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ return ret;
+}
+
+static int submit_flush_wait(struct f2fs_sb_info *sbi)
+{
+ int ret = __submit_flush_wait(sbi->sb->s_bdev);
+ int i;
+
+ if (sbi->s_ndevs && !ret) {
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ ret = __submit_flush_wait(FDEV(i).bdev);
+ if (ret)
+ break;
+ }
+ }
+ return ret;
+}
+
static int issue_flush_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
@@ -410,25 +441,18 @@ repeat:
return 0;
if (!llist_empty(&fcc->issue_list)) {
- struct bio *bio;
struct flush_cmd *cmd, *next;
int ret;
- bio = f2fs_bio_alloc(0);
-
fcc->dispatch_list = llist_del_all(&fcc->issue_list);
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
-
+ ret = submit_flush_wait(sbi);
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
complete(&cmd->wait);
}
- bio_put(bio);
fcc->dispatch_list = NULL;
}
@@ -449,15 +473,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
return 0;
if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
- struct bio *bio = f2fs_bio_alloc(0);
int ret;
atomic_inc(&fcc->submit_flush);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
+ ret = submit_flush_wait(sbi);
atomic_dec(&fcc->submit_flush);
- bio_put(bio);
return ret;
}
@@ -469,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
- wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->submit_flush);
+ if (fcc->f2fs_issue_flush) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->submit_flush);
+ } else {
+ llist_del_all(&fcc->issue_list);
+ atomic_set(&fcc->submit_flush, 0);
+ }
return cmd.ret;
}
@@ -481,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
struct flush_cmd_control *fcc;
int err = 0;
+ if (SM_I(sbi)->cmd_control_info) {
+ fcc = SM_I(sbi)->cmd_control_info;
+ goto init_thread;
+ }
+
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
@@ -488,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
+init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
@@ -500,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
return err;
}
-void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
{
struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
- if (fcc && fcc->f2fs_issue_flush)
- kthread_stop(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ if (fcc && fcc->f2fs_issue_flush) {
+ struct task_struct *flush_thread = fcc->f2fs_issue_flush;
+
+ fcc->f2fs_issue_flush = NULL;
+ kthread_stop(flush_thread);
+ }
+ if (free) {
+ kfree(fcc);
+ SM_I(sbi)->cmd_control_info = NULL;
+ }
}
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -633,15 +670,23 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
}
/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
-int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
{
- struct block_device *bdev = sbi->sb->s_bdev;
struct bio *bio = NULL;
int err;
- err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
- &bio);
+ trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ err = __blkdev_issue_discard(bdev,
+ SECTOR_FROM_BLOCK(blkstart),
+ SECTOR_FROM_BLOCK(blklen),
+ GFP_NOFS, 0, &bio);
if (!err && bio) {
struct bio_entry *be = __add_bio_entry(sbi, bio);
@@ -654,24 +699,101 @@ int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
return err;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
+ sector_t sector;
+ int devi = 0;
+
+ if (sbi->s_ndevs) {
+ devi = f2fs_target_device_index(sbi, blkstart);
+ blkstart -= FDEV(devi).start_blk;
+ }
+ sector = SECTOR_FROM_BLOCK(blkstart);
+
+ if (sector & (bdev_zone_sectors(bdev) - 1) ||
+ nr_sects != bdev_zone_sectors(bdev)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "(%d) %s: Unaligned discard attempted (block %x + %x)",
+ devi, sbi->s_ndevs ? FDEV(devi).path: "",
+ blkstart, blklen);
+ return -EIO;
+ }
+
+ /*
+ * We need to know the type of the zone: for conventional zones,
+ * use regular discard if the drive supports it. For sequential
+ * zones, reset the zone write pointer.
+ */
+ switch (get_blkz_type(sbi, bdev, blkstart)) {
+
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return 0;
+ return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
+ return blkdev_reset_zones(bdev, sector,
+ nr_sects, GFP_NOFS);
+ default:
+ /* Unknown zone type: broken device ? */
+ return -EIO;
+ }
+}
+#endif
+
+static int __issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+ bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+ return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
+#endif
+ return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+}
+
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
- sector_t start = SECTOR_FROM_BLOCK(blkstart);
- sector_t len = SECTOR_FROM_BLOCK(blklen);
+ sector_t start = blkstart, len = 0;
+ struct block_device *bdev;
struct seg_entry *se;
unsigned int offset;
block_t i;
+ int err = 0;
+
+ bdev = f2fs_target_device(sbi, blkstart, NULL);
+
+ for (i = blkstart; i < blkstart + blklen; i++, len++) {
+ if (i != start) {
+ struct block_device *bdev2 =
+ f2fs_target_device(sbi, i, NULL);
+
+ if (bdev2 != bdev) {
+ err = __issue_discard_async(sbi, bdev,
+ start, len);
+ if (err)
+ return err;
+ bdev = bdev2;
+ start = i;
+ len = 0;
+ }
+ }
- for (i = blkstart; i < blkstart + blklen; i++) {
se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
offset = GET_BLKOFF_FROM_SEG0(sbi, i);
if (!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
}
- trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
- return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0);
+
+ if (len)
+ err = __issue_discard_async(sbi, bdev, start, len);
+ return err;
}
static void __add_discard_entry(struct f2fs_sb_info *sbi,
@@ -1296,25 +1418,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
stat_inc_seg_type(sbi, curseg);
}
-static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
-{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int old_segno;
-
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
- locate_dirty_segment(sbi, old_segno);
-}
-
void allocate_new_segments(struct f2fs_sb_info *sbi)
{
+ struct curseg_info *curseg;
+ unsigned int old_segno;
int i;
if (test_opt(sbi, LFS))
return;
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segments(sbi, i);
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ curseg = CURSEG_I(sbi, i);
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_segno);
+ }
}
static const struct segment_allocation default_salloc_ops = {
@@ -1448,21 +1566,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_summary *sum, int type)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct curseg_info *curseg;
- bool direct_io = (type == CURSEG_DIRECT_IO);
-
- type = direct_io ? CURSEG_WARM_DATA : type;
-
- curseg = CURSEG_I(sbi, type);
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
- /* direct_io'ed data is aligned to the segment for better performance */
- if (direct_io && curseg->next_blkoff &&
- !has_not_enough_free_secs(sbi, 0, 0))
- __allocate_new_segments(sbi, type);
-
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/*
@@ -1515,7 +1623,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.sbi = sbi,
.type = META,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = page->index,
.new_blkaddr = page->index,
.page = page,
@@ -2166,7 +2274,6 @@ out:
static int build_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct sit_info *sit_i;
unsigned int sit_segs, start;
char *src_bitmap, *dst_bitmap;
@@ -2233,7 +2340,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
- sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+ sit_i->written_valid_blocks = 0;
sit_i->sit_bitmap = dst_bitmap;
sit_i->bitmap_size = bitmap_size;
sit_i->dirty_sentries = 0;
@@ -2315,10 +2422,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
+ readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+ META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2387,6 +2494,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
+ else
+ SIT_I(sbi)->written_valid_blocks +=
+ sentry->valid_blocks;
}
/* set use the current segments */
@@ -2645,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
if (!sm_info)
return;
- destroy_flush_cmd_control(sbi);
+ destroy_flush_cmd_control(sbi, true);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index fecb856ad874..9d44ce83acb2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -18,6 +18,8 @@
#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
+#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
+
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
@@ -102,8 +104,6 @@
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
(sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
-#define MAX_BIO_BLOCKS(sbi) \
- ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
if (test_opt(sbi, LFS))
return false;
- return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
reserved_sections(sbi) + 1);
}
@@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
-
- node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed);
+ (node_secs + 2 * dent_secs + imeta_secs +
+ reserved_sections(sbi) + needed);
}
static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -695,13 +696,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
return false;
}
-static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(queue_max_sectors(q));
-}
-
/*
* It is very important to gather dirty pages and write at once, so that we can
* submit a big bio without interfering other data writes.
@@ -719,7 +713,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 8 * sbi->blocks_per_seg;
else if (type == META)
- return 8 * MAX_BIO_BLOCKS(sbi);
+ return 8 * BIO_MAX_PAGES;
else
return 0;
}
@@ -736,11 +730,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
return 0;
nr_to_write = wbc->nr_to_write;
-
+ desired = BIO_MAX_PAGES;
if (type == NODE)
- desired = 2 * max_hw_blocks(sbi);
- else
- desired = MAX_BIO_BLOCKS(sbi);
+ desired <<= 1;
wbc->nr_to_write = desired;
return desired - nr_to_write;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 46c915425923..5c60fc28ec75 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -21,14 +21,16 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
{
- return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+ long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+
+ return count > 0 ? count : 0;
}
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
{
- if (NM_I(sbi)->fcnt > MAX_FREE_NIDS)
- return NM_I(sbi)->fcnt - MAX_FREE_NIDS;
- return 0;
+ long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS;
+
+ return count > 0 ? count : 0;
}
static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6132b4ce4e4c..46fd30d8af77 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -412,14 +412,20 @@ static int parse_options(struct super_block *sb, char *options)
q = bdev_get_queue(sb->s_bdev);
if (blk_queue_discard(q)) {
set_opt(sbi, DISCARD);
- } else {
+ } else if (!f2fs_sb_mounted_blkzoned(sb)) {
f2fs_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but "
"the device does not support discard");
}
break;
case Opt_nodiscard:
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "discard is required for zoned block devices");
+ return -EINVAL;
+ }
clear_opt(sbi, DISCARD);
+ break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
@@ -512,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options)
return -ENOMEM;
if (strlen(name) == 8 &&
!strncmp(name, "adaptive", 8)) {
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "adaptive mode is not allowed with "
+ "zoned block device feature");
+ kfree(name);
+ return -EINVAL;
+ }
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
@@ -558,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
- if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) {
- kmem_cache_free(f2fs_inode_cachep, fi);
- return NULL;
- }
-
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
+ atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
@@ -620,24 +629,25 @@ static int f2fs_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
-int f2fs_inode_dirtied(struct inode *inode)
+int f2fs_inode_dirtied(struct inode *inode, bool sync)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret = 0;
spin_lock(&sbi->inode_lock[DIRTY_META]);
if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
- spin_unlock(&sbi->inode_lock[DIRTY_META]);
- return 1;
+ ret = 1;
+ } else {
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
}
-
- set_inode_flag(inode, FI_DIRTY_INODE);
- list_add_tail(&F2FS_I(inode)->gdirty_list,
+ if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
&sbi->inode_list[DIRTY_META]);
- inc_page_count(sbi, F2FS_DIRTY_IMETA);
- stat_inc_dirty_inode(sbi, DIRTY_META);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
spin_unlock(&sbi->inode_lock[DIRTY_META]);
-
- return 0;
+ return ret;
}
void f2fs_inode_synced(struct inode *inode)
@@ -649,10 +659,12 @@ void f2fs_inode_synced(struct inode *inode)
spin_unlock(&sbi->inode_lock[DIRTY_META]);
return;
}
- list_del_init(&F2FS_I(inode)->gdirty_list);
+ if (!list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
clear_inode_flag(inode, FI_DIRTY_INODE);
clear_inode_flag(inode, FI_AUTO_RECOVER);
- dec_page_count(sbi, F2FS_DIRTY_IMETA);
stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
}
@@ -676,7 +688,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags)
if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
clear_inode_flag(inode, FI_AUTO_RECOVER);
- f2fs_inode_dirtied(inode);
+ f2fs_inode_dirtied(inode, false);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -687,20 +699,28 @@ static void f2fs_i_callback(struct rcu_head *head)
static void f2fs_destroy_inode(struct inode *inode)
{
- percpu_counter_destroy(&F2FS_I(inode)->dirty_pages);
call_rcu(&inode->i_rcu, f2fs_i_callback);
}
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- int i;
-
- for (i = 0; i < NR_COUNT_TYPE; i++)
- percpu_counter_destroy(&sbi->nr_pages[i]);
percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
}
+static void destroy_device_list(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ blkdev_put(FDEV(i).bdev, FMODE_EXCL);
+#ifdef CONFIG_BLK_DEV_ZONED
+ kfree(FDEV(i).blkz_type);
+#endif
+ }
+ kfree(sbi->devs);
+}
+
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -738,7 +758,6 @@ static void f2fs_put_super(struct super_block *sb)
* In addition, EIO will skip do checkpoint, we need this as well.
*/
release_ino_entry(sbi, true);
- release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
@@ -762,6 +781,8 @@ static void f2fs_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
+ destroy_device_list(sbi);
+
destroy_percpu_info(sbi);
kfree(sbi);
}
@@ -789,13 +810,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
static int f2fs_freeze(struct super_block *sb)
{
- int err;
-
if (f2fs_readonly(sb))
return 0;
- err = f2fs_sync_fs(sb, 1);
- return err;
+ /* IO error happened before */
+ if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ return -EIO;
+
+ /* must be clean, since sync_filesystem() was already called */
+ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ return -EINVAL;
+ return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
@@ -822,7 +847,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
- buf->f_ffree = buf->f_files - valid_inode_count(sbi);
+ buf->f_ffree = min(buf->f_files - valid_node_count(sbi),
+ buf->f_bavail);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -974,7 +1000,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, EXTENT_CACHE);
sbi->sb->s_flags |= MS_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
- if (f2fs_sb_mounted_hmsmr(sbi->sb)) {
+ if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
set_opt(sbi, DISCARD);
} else {
@@ -1076,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if flush_merge is not passed in mount option.
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
- destroy_flush_cmd_control(sbi);
- } else if (!SM_I(sbi)->cmd_control_info) {
+ clear_opt(sbi, FLUSH_MERGE);
+ destroy_flush_cmd_control(sbi, false);
+ } else {
err = create_flush_cmd_control(sbi);
if (err)
goto restore_gc;
@@ -1238,7 +1265,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
unlock_buffer(bh);
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
}
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1426,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned int ovp_segments, reserved_segments;
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -1437,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
+ ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+ reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+
+ if (unlikely(fsmeta < F2FS_MIN_SEGMENTS ||
+ ovp_segments == 0 || reserved_segments == 0)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Wrong layout: check mkfs.f2fs version");
+ return 1;
+ }
+
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
@@ -1447,6 +1485,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
+ int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
@@ -1471,6 +1510,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ atomic_set(&sbi->nr_pages[i], 0);
+
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
mutex_init(&sbi->wio_mutex[NODE]);
@@ -1486,13 +1528,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
static int init_percpu_info(struct f2fs_sb_info *sbi)
{
- int i, err;
-
- for (i = 0; i < NR_COUNT_TYPE; i++) {
- err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
- if (err)
- return err;
- }
+ int err;
err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL);
if (err)
@@ -1502,6 +1538,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
GFP_KERNEL);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
+{
+ struct block_device *bdev = FDEV(devi).bdev;
+ sector_t nr_sectors = bdev->bd_part->nr_sects;
+ sector_t sector = 0;
+ struct blk_zone *zones;
+ unsigned int i, nr_zones;
+ unsigned int n = 0;
+ int err = -EIO;
+
+ if (!f2fs_sb_mounted_blkzoned(sbi->sb))
+ return 0;
+
+ if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
+ SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
+ return -EINVAL;
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
+ if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
+ __ilog2_u32(sbi->blocks_per_blkz))
+ return -EINVAL;
+ sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
+ FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
+ sbi->log_blocks_per_blkz;
+ if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
+ FDEV(devi).nr_blkz++;
+
+ FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
+ if (!FDEV(devi).blkz_type)
+ return -ENOMEM;
+
+#define F2FS_REPORT_NR_ZONES 4096
+
+ zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone),
+ GFP_KERNEL);
+ if (!zones)
+ return -ENOMEM;
+
+ /* Get block zones type */
+ while (zones && sector < nr_sectors) {
+
+ nr_zones = F2FS_REPORT_NR_ZONES;
+ err = blkdev_report_zones(bdev, sector,
+ zones, &nr_zones,
+ GFP_KERNEL);
+ if (err)
+ break;
+ if (!nr_zones) {
+ err = -EIO;
+ break;
+ }
+
+ for (i = 0; i < nr_zones; i++) {
+ FDEV(devi).blkz_type[n] = zones[i].type;
+ sector += zones[i].len;
+ n++;
+ }
+ }
+
+ kfree(zones);
+
+ return err;
+}
+#endif
+
/*
* Read f2fs raw super block.
* Because we have two copies of super block, so read both of them
@@ -1594,6 +1695,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
}
+static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ int i;
+
+ for (i = 0; i < MAX_DEVICES; i++) {
+ if (!RDEV(i).path[0])
+ return 0;
+
+ if (i == 0) {
+ sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
+ MAX_DEVICES, GFP_KERNEL);
+ if (!sbi->devs)
+ return -ENOMEM;
+ }
+
+ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
+ FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
+ if (i == 0) {
+ FDEV(i).start_blk = 0;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
+ } else {
+ FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1;
+ }
+
+ FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+ sbi->sb->s_mode, sbi->sb->s_type);
+ if (IS_ERR(FDEV(i).bdev))
+ return PTR_ERR(FDEV(i).bdev);
+
+ /* to release errored devices */
+ sbi->s_ndevs = i + 1;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
+ !f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Zoned block device feature not enabled\n");
+ return -EINVAL;
+ }
+ if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (init_blkz_info(sbi, i)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Failed to initialize F2FS blkzone information");
+ return -EINVAL;
+ }
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+ "Host-aware" : "Host-managed");
+ continue;
+ }
+#endif
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk);
+ }
+ return 0;
+}
+
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
@@ -1641,6 +1813,18 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ /*
+ * The BLKZONED feature indicates that the drive was formatted with
+ * zone alignment optimization. This is optional for host-aware
+ * devices, but mandatory for host-managed zoned block devices.
+ */
+#ifndef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Zoned block device support is not enabled\n");
+ goto free_sb_buf;
+ }
+#endif
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1710,6 +1894,13 @@ try_onemore:
goto free_meta_inode;
}
+ /* Initialize device list */
+ err = f2fs_scan_devices(sbi);
+ if (err) {
+ f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+ goto free_devices;
+ }
+
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -1893,12 +2084,21 @@ free_node_inode:
mutex_lock(&sbi->umount_mutex);
release_ino_entry(sbi, true);
f2fs_leave_shrinker(sbi);
+ /*
+ * Some dirty meta pages can be produced by recover_orphan_inodes()
+ * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg()
+ * followed by write_checkpoint() through f2fs_write_node_pages(), which
+ * falls into an infinite loop in sync_meta_pages().
+ */
+ truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
+free_devices:
+ destroy_device_list(sbi);
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
@@ -2044,3 +2244,4 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3e1c0280f866..c47ce2f330a1 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 350a2c8cfd28..e1c54f20325c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -25,7 +25,7 @@
#include <asm/poll.h>
#include <asm/siginfo.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
@@ -52,7 +52,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
- if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) {
+ if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
!filp->f_mapping->a_ops->direct_IO)
return -EINVAL;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ca3c3dd01789..5559168d5637 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -8,7 +8,7 @@
#include <linux/fs_struct.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
#include "mount.h"
diff --git a/fs/file_table.c b/fs/file_table.c
index ad17e05ebf95..6d982b57de92 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -155,7 +155,7 @@ over:
* @mode: the mode with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
-struct file *alloc_file(struct path *path, fmode_t mode,
+struct file *alloc_file(const struct path *path, fmode_t mode,
const struct file_operations *fop)
{
struct file *file;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index c5618db110be..cac75547d35c 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -14,7 +14,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* Handling of filesystem drivers list.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 05713a5da083..ef600591d96f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb,
* become available for writeback. Otherwise
* we'll just busyloop.
*/
- if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb, work);
- inode = wb_inode(wb->b_more_io.prev);
- spin_lock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
- /* This function drops i_lock... */
- inode_sleep_on_writeback(inode);
- spin_lock(&wb->list_lock);
- }
+ trace_writeback_wait(wb, work);
+ inode = wb_inode(wb->b_more_io.prev);
+ spin_lock(&inode->i_lock);
+ spin_unlock(&wb->list_lock);
+ /* This function drops i_lock... */
+ inode_sleep_on_writeback(inode);
+ spin_lock(&wb->list_lock);
}
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 4304072161aa..40d61077bead 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
if (invalidate)
set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
}
} else {
@@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
TASK_UNINTERRUPTIBLE);
+ /* Make sure any pending writes are cancelled. */
+ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+ fscache_invalidate_writes(cookie);
+
/* Reset the cookie state if it wasn't relinquished */
if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
atomic_inc(&cookie->n_active);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 9b28649df3a1..a8aa00be4444 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
spin_lock_init(&cookie->lock);
+ spin_lock_init(&cookie->stores_lock);
INIT_HLIST_HEAD(&cookie->backing_objects);
/* check the netfs type is not already present */
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 9e792e30f4db..7a182c87f378 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object
static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_dead(struct fscache_object *, int);
#define __STATE_NAME(n) fscache_osm_##n
#define STATE(n) (&__STATE_NAME(n))
@@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure);
static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object);
static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents);
static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object);
-static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL);
+static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead);
static WAIT_STATE(WAIT_FOR_INIT, "?INI",
TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD));
@@ -229,6 +230,10 @@ execute_work_state:
event = -1;
if (new_state == NO_TRANSIT) {
_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+ if (unlikely(state == STATE(OBJECT_DEAD))) {
+ _leave(" [dead]");
+ return;
+ }
fscache_enqueue_object(object);
event_mask = object->oob_event_mask;
goto unmask_events;
@@ -239,7 +244,7 @@ execute_work_state:
object->state = state = new_state;
if (state->work) {
- if (unlikely(state->work == ((void *)2UL))) {
+ if (unlikely(state == STATE(OBJECT_DEAD))) {
_leave(" [dead]");
return;
}
@@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob
fscache_mark_object_dead(object);
object->oob_event_mask = 0;
+ if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) {
+ /* Reject any new read/write ops and abort any that are pending. */
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+ fscache_cancel_all_ops(object);
+ }
+
if (list_empty(&object->dependents) &&
object->n_ops == 0 &&
object->n_children == 0)
@@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object,
}
}
EXPORT_SYMBOL(fscache_object_mark_killed);
+
+/*
+ * The object is dead. We can get here if an object gets queued by an event
+ * that would lead to its death (such as EV_KILL) when the dispatcher is
+ * already running (and so can be requeued) but hasn't yet cleared the event
+ * mask.
+ */
+static const struct fscache_state *fscache_object_dead(struct fscache_object *object,
+ int event)
+{
+ if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD,
+ &object->flags))
+ return NO_TRANSIT;
+
+ WARN(true, "FS-Cache object redispatched after death");
+ return NO_TRANSIT;
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 70ea57c7b6bb..f11792672977 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->waitq.lock);
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ spin_unlock(&fiq->waitq.lock);
+ return;
+ }
if (list_empty(&req->intr_entry)) {
list_add_tail(&req->intr_entry, &fiq->interrupts);
wake_up_locked(&fiq->waitq);
@@ -1372,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
* code can Oops if the buffer persists after module unload.
*/
bufs[page_nr].ops = &nosteal_pipe_buf_ops;
+ bufs[page_nr].flags = 0;
ret = add_to_pipe(pipe, &bufs[page_nr++]);
if (unlikely(ret < 0))
break;
@@ -2025,7 +2030,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
struct fuse_req *req;
req = list_entry(head->next, struct fuse_req, list);
req->out.h.error = -ECONNABORTED;
- clear_bit(FR_PENDING, &req->flags);
clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
request_end(fc, req);
@@ -2103,6 +2107,8 @@ void fuse_abort_conn(struct fuse_conn *fc)
spin_lock(&fiq->waitq.lock);
fiq->connected = 0;
list_splice_init(&fiq->pending, &to_end2);
+ list_for_each_entry(req, &to_end2, list)
+ clear_bit(FR_PENDING, &req->flags);
while (forget_pending(fiq))
kfree(dequeue_forget(fiq, 1, NULL));
wake_up_all_locked(&fiq->waitq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 096f79997f75..811fd8929a18 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -68,7 +68,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec)
if (sec || nsec) {
struct timespec64 ts = {
sec,
- max_t(u32, nsec, NSEC_PER_SEC - 1)
+ min_t(u32, nsec, NSEC_PER_SEC - 1)
};
return get_jiffies_64() + timespec64_to_jiffies(&ts);
@@ -1831,7 +1831,6 @@ static const struct inode_operations fuse_common_inode_operations = {
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
.get_link = fuse_get_link,
- .readlink = generic_readlink,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 5a6f52ea2722..6b039d7ce160 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -839,12 +839,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
kaddr = kmap_atomic(page);
memcpy(buf + pos, kaddr + pos, copied);
- memset(kaddr + pos + copied, 0, len - copied);
flush_dcache_page(page);
kunmap_atomic(kaddr);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ WARN_ON(!PageUptodate(page));
unlock_page(page);
put_page(page);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 3cdde5f5d399..79113219be5f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -62,6 +62,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/vmalloc.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e23ff70b3435..016c11eaca7c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -22,7 +22,7 @@
#include <linux/swap.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
#include <linux/delay.h>
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 14cbf60167a7..94f50cac91c6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -21,7 +21,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
@@ -695,7 +695,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_ops = glops;
- gl->gl_dstamp = ktime_set(0, 0);
+ gl->gl_dstamp = 0;
preempt_disable();
/* We use the global stats to estimate the initial per-glock stats */
gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fe3f84995c48..eb7724b8578a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -19,7 +19,7 @@
#include <linux/crc32.h>
#include <linux/fiemap.h>
#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "gfs2.h"
#include "incore.h"
@@ -2067,7 +2067,6 @@ const struct inode_operations gfs2_dir_iops = {
};
const struct inode_operations gfs2_symlink_iops = {
- .readlink = generic_readlink,
.get_link = gfs2_get_link,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e58ccef09c91..27c00a16def0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
struct gfs2_log_header *lh;
unsigned int tail;
u32 hash;
- int op_flags = WRITE_FLUSH_FUA | REQ_META;
+ int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META;
struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
lh = page_address(page);
@@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
log_flush_wait(sdp);
- op_flags = WRITE_SYNC | REQ_META | REQ_PRIO;
+ op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
}
sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 49d5a1b61b06..b1f9144b42c7 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio)
* gfs2_log_flush_bio - Submit any pending log bio
* @sdp: The superblock
* @op: REQ_OP
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 373639a59782..49db8ef13fdf 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_flags = REQ_META | REQ_PRIO |
- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
BUG_ON(!PageLocked(page));
BUG_ON(!page_has_buffers(page));
@@ -285,7 +284,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
}
}
- gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
+ gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
@@ -453,7 +452,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ff72ac6439c8..a34308df927f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index c9ff1cf7d4f3..f8d30e41d1d3 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -15,7 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/kobject.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/gfs2_ondisk.h>
#include <linux/genhd.h>
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index aee4485ad8a9..763d659db91b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -14,7 +14,7 @@
#include <linux/buffer_head.h>
#include <linux/crc32.h>
#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index a4a577088d19..d87721aeb575 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -14,7 +14,7 @@
#include <linux/xattr.h>
#include <linux/gfs2_ondisk.h>
#include <linux/posix_acl_xattr.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4cdec5a19347..6d0783e2e276 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -23,7 +23,7 @@
#include <linux/workqueue.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "hfs.h"
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 99627f8a0a18..0a156d84e67d 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -16,7 +16,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "hfsplus_fs.h"
/*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 11854dd84572..67aedf4c2e7c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
sbi->s_vhdr_buf, NULL, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (!error)
error = error2;
if (!write_backup)
@@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (!error)
error2 = error;
out:
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 23e15ea53e45..e61261a7417e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -920,7 +920,6 @@ static const char *hostfs_get_link(struct dentry *dentry,
}
static const struct inode_operations hostfs_link_iops = {
- .readlink = generic_readlink,
.get_link = hostfs_get_link,
};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4fb7b10f3a05..54de77e78775 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -37,7 +37,7 @@
#include <linux/migrate.h>
#include <linux/uio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
diff --git a/fs/internal.h b/fs/internal.h
index f4da3341b4a3..b63cf3af2dc2 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -62,7 +62,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
extern void *copy_mount_options(const void __user *);
extern char *copy_mount_string(const void __user *);
-extern struct vfsmount *lookup_mnt(struct path *);
+extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
extern int sb_prepare_remount_readonly(struct super_block *);
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap_ops *ops, void *data,
iomap_actor_t actor);
+
+/* direct-io.c: */
+int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index c415668c86d4..cb9b02940805 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -223,7 +223,11 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
if (!src_file.file)
return -EBADF;
- ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
+ goto fdput;
+ ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+fdput:
fdput(src_file);
return ret;
}
diff --git a/fs/iomap.c b/fs/iomap.c
index a8ee8c33ca78..a51cb4c07d4d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"
@@ -113,6 +114,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
BUG_ON(pos + len > iomap->offset + iomap->length);
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -467,8 +471,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
offset = page_offset(page);
while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
- ops, page, iomap_page_mkwrite_actor);
+ ret = iomap_apply(inode, offset, length,
+ IOMAP_WRITE | IOMAP_FAULT, ops, page,
+ iomap_page_mkwrite_actor);
if (unlikely(ret <= 0))
goto out_unlock;
offset += ret;
@@ -583,3 +588,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE (1 << 30)
+#define IOMAP_DIO_DIRTY (1 << 31)
+
+struct iomap_dio {
+ struct kiocb *iocb;
+ iomap_dio_end_io_t *end_io;
+ loff_t i_size;
+ loff_t size;
+ atomic_t ref;
+ unsigned flags;
+ int error;
+
+ union {
+ /* used during submission and for synchronous completion: */
+ struct {
+ struct iov_iter *iter;
+ struct task_struct *waiter;
+ struct request_queue *last_queue;
+ blk_qc_t cookie;
+ } submit;
+
+ /* used for aio completion: */
+ struct {
+ struct work_struct work;
+ } aio;
+ };
+};
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ if (dio->end_io) {
+ ret = dio->end_io(iocb,
+ dio->error ? dio->error : dio->size,
+ dio->flags);
+ } else {
+ ret = dio->error;
+ }
+
+ if (likely(!ret)) {
+ ret = dio->size;
+ /* check for short read */
+ if (iocb->ki_pos + ret > dio->i_size &&
+ !(dio->flags & IOMAP_DIO_WRITE))
+ ret = dio->i_size - iocb->ki_pos;
+ iocb->ki_pos += ret;
+ }
+
+ inode_dio_end(file_inode(iocb->ki_filp));
+ kfree(dio);
+
+ return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+ struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+ struct kiocb *iocb = dio->iocb;
+ bool is_write = (dio->flags & IOMAP_DIO_WRITE);
+ ssize_t ret;
+
+ ret = iomap_dio_complete(dio);
+ if (is_write && ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ iocb->ki_complete(iocb, ret, 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet. We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+ cmpxchg(&dio->error, 0, ret);
+}
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_error)
+ iomap_dio_set_error(dio, bio->bi_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ if (is_sync_kiocb(dio->iocb)) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ wake_up_process(waiter);
+ } else if (dio->flags & IOMAP_DIO_WRITE) {
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+ } else {
+ iomap_dio_complete_work(&dio->aio.work);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+}
+
+static blk_qc_t
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+ unsigned len)
+{
+ struct page *page = ZERO_PAGE(0);
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ get_page(page);
+ if (bio_add_page(bio, page, len, 0) != len)
+ BUG();
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+
+ atomic_inc(&dio->ref);
+ return submit_bio(bio);
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct iomap_dio *dio = data;
+ unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+ unsigned fs_block_size = (1 << inode->i_blkbits), pad;
+ unsigned align = iov_iter_alignment(dio->submit.iter);
+ struct iov_iter iter;
+ struct bio *bio;
+ bool need_zeroout = false;
+ int nr_pages, ret;
+
+ if ((pos | length | align) & ((1 << blkbits) - 1))
+ return -EINVAL;
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+ return -EIO;
+ /*FALLTHRU*/
+ case IOMAP_UNWRITTEN:
+ if (!(dio->flags & IOMAP_DIO_WRITE)) {
+ iov_iter_zero(length, dio->submit.iter);
+ dio->size += length;
+ return length;
+ }
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ break;
+ case IOMAP_MAPPED:
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+ if (iomap->flags & IOMAP_F_NEW)
+ need_zeroout = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ /*
+ * Operate on a partial iter trimmed to the extent we were called for.
+ * We'll update the iter in the dio once we're done with this extent.
+ */
+ iter = *dio->submit.iter;
+ iov_iter_truncate(&iter, length);
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+ if (nr_pages <= 0)
+ return nr_pages;
+
+ if (need_zeroout) {
+ /* zero out from the start of the block to the write offset */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos - pad, pad);
+ }
+
+ do {
+ if (dio->error)
+ return 0;
+
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ ret = bio_iov_iter_get_pages(bio, &iter);
+ if (unlikely(ret)) {
+ bio_put(bio);
+ return ret;
+ }
+
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ task_io_account_write(bio->bi_iter.bi_size);
+ } else {
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (dio->flags & IOMAP_DIO_DIRTY)
+ bio_set_pages_dirty(bio);
+ }
+
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+
+ atomic_inc(&dio->ref);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+ } while (nr_pages);
+
+ if (need_zeroout) {
+ /* zero out from the end of the write to the end of the block */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+ }
+
+ iov_iter_advance(dio->submit.iter, length);
+ return length;
+}
+
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
+ iomap_dio_end_io_t end_io)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+ unsigned int flags = IOMAP_DIRECT;
+ struct blk_plug plug;
+ struct iomap_dio *dio;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (!count)
+ return 0;
+
+ dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+
+ dio->iocb = iocb;
+ atomic_set(&dio->ref, 1);
+ dio->size = 0;
+ dio->i_size = i_size_read(inode);
+ dio->end_io = end_io;
+ dio->error = 0;
+ dio->flags = 0;
+
+ dio->submit.iter = iter;
+ if (is_sync_kiocb(iocb)) {
+ dio->submit.waiter = current;
+ dio->submit.cookie = BLK_QC_T_NONE;
+ dio->submit.last_queue = NULL;
+ }
+
+ if (iov_iter_rw(iter) == READ) {
+ if (pos >= dio->i_size)
+ goto out_free_dio;
+
+ if (iter->type == ITER_IOVEC)
+ dio->flags |= IOMAP_DIO_DIRTY;
+ } else {
+ dio->flags |= IOMAP_DIO_WRITE;
+ flags |= IOMAP_WRITE;
+ }
+
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_free_dio;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
+ }
+
+ inode_dio_begin(inode);
+
+ blk_start_plug(&plug);
+ do {
+ ret = iomap_apply(inode, pos, count, flags, ops, dio,
+ iomap_dio_actor);
+ if (ret <= 0) {
+ /* magic error code to fall back to buffered I/O */
+ if (ret == -ENOTBLK)
+ ret = 0;
+ break;
+ }
+ pos += ret;
+ } while ((count = iov_iter_count(iter)) > 0);
+ blk_finish_plug(&plug);
+
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+
+ if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+ !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+ }
+
+ if (!atomic_dec_and_test(&dio->ref)) {
+ if (!is_sync_kiocb(iocb))
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->submit.waiter))
+ break;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !dio->submit.last_queue ||
+ !blk_mq_poll(dio->submit.last_queue,
+ dio->submit.cookie))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ */
+ if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
+ return iomap_dio_complete(dio);
+
+out_free_dio:
+ kfree(dio);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 44af14b2e916..9bb2fe35799d 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/bio.h>
#include <linux/vmalloc.h>
#include <linux/zlib.h>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 684996c8a3a4..4055f51617ef 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count)
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
+ write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 31f8ca046639..8c514367ba5a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh);
+ ret = submit_bh(REQ_OP_WRITE,
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
else
- ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
*cbh = bh;
return ret;
@@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- WRITE_SYNC);
+ REQ_SYNC);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd_debug(3, "superblock not updated\n");
@@ -717,7 +718,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
stats.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 927da4956a89..a097048ed1a3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -47,7 +47,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
#ifdef CONFIG_JBD2_DEBUG
@@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
if (ret)
goto out;
@@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal)
/* Lock here to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
/*
- * Update log tail information. We use WRITE_FUA since new
+ * Update log tail information. We use REQ_FUA since new
* transaction will start reusing journal space and so we
* must make sure information about current log tail is on
* disk before that.
@@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- WRITE_FUA);
+ REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_FUA);
+ jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal)
++journal->j_transaction_sequence;
write_unlock(&journal->j_state_lock);
- jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
+ jbd2_mark_journal_empty(journal,
+ REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, WRITE_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, WRITE_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 91171dc352cb..cfc38b552118 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal,
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
- write_dirty_buffer(descriptor, WRITE_SYNC);
+ write_dirty_buffer(descriptor, REQ_SYNC);
}
#endif
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 8f3f0855fcd2..d2fa138a868c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -13,7 +13,6 @@
const struct inode_operations jffs2_symlink_inode_operations =
{
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 8653cac7e12e..fc89f9436784 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -13,7 +13,7 @@
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "jfs_filsys.h"
#include "jfs_debug.h"
@@ -121,7 +121,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
jfs_set_inode_flags(inode);
inode_unlock(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
setflags_out:
mnt_drop_write_file(filp);
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index a37eb5f8cbc0..a70907606025 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -22,7 +22,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_debug.h"
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a21ea8b3e5fa..bb1da1feafeb 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ bio->bi_opf = REQ_OP_READ;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 85671f7f8518..2be7c9ce6663 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -31,7 +31,7 @@
#include <linux/exportfs.h>
#include <linux/crc32.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index c82404fee6cd..38320607993e 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -22,14 +22,12 @@
#include "jfs_xattr.h"
const struct inode_operations jfs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = jfs_setattr,
.listxattr = jfs_listxattr,
};
const struct inode_operations jfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = jfs_setattr,
.listxattr = jfs_listxattr,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index a1982118f92f..ac9e108ce1ea 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -335,7 +335,7 @@ static int kernfs_xattr_set(const struct xattr_handler *handler,
return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
}
-const struct xattr_handler kernfs_trusted_xattr_handler = {
+static const struct xattr_handler kernfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = kernfs_xattr_get,
.set = kernfs_xattr_set,
@@ -372,7 +372,7 @@ static int kernfs_security_xattr_set(const struct xattr_handler *handler,
return error;
}
-const struct xattr_handler kernfs_security_xattr_handler = {
+static const struct xattr_handler kernfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = kernfs_xattr_get,
.set = kernfs_security_xattr_set,
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 9b43ca02b7ab..1684af4a8b9b 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -135,7 +135,6 @@ static const char *kernfs_iop_get_link(struct dentry *dentry,
const struct inode_operations kernfs_symlink_iops = {
.listxattr = kernfs_iop_listxattr,
- .readlink = generic_readlink,
.get_link = kernfs_iop_get_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
diff --git a/fs/libfs.c b/fs/libfs.c
index 48826d4da189..28d6f35feed6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -16,7 +16,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
struct inode *root;
struct qstr d_name = QSTR_INIT(name, strlen(name));
- s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
+ s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER,
+ &init_user_ns, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
@@ -465,6 +466,8 @@ EXPORT_SYMBOL(simple_write_begin);
* is not called, so a filesystem that actually does store data in .write_inode
* should extend on what's done here with a call to mark_inode_dirty() in the
* case that i_size has changed.
+ *
+ * Use *ONLY* with simple_readpage()
*/
int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -474,14 +477,14 @@ int simple_write_end(struct file *file, struct address_space *mapping,
loff_t last_pos = pos + copied;
/* zero the stale part of the page if we did a short copy */
- if (copied < len) {
- unsigned from = pos & (PAGE_SIZE - 1);
-
- zero_user(page, from + copied, len - copied);
- }
+ if (!PageUptodate(page)) {
+ if (copied < len) {
+ unsigned from = pos & (PAGE_SIZE - 1);
- if (!PageUptodate(page))
+ zero_user(page, from + copied, len - copied);
+ }
SetPageUptodate(page);
+ }
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
@@ -1129,7 +1132,6 @@ EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
.get_link = simple_get_link,
- .readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5426189406c1..fb8cac88251a 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -15,6 +15,6 @@ struct lockd_net {
struct list_head nsm_handles;
};
-extern int lockd_net_id;
+extern unsigned int lockd_net_id;
#endif
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index fc4084ef4736..1c13dd80744f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
-int lockd_net_id;
+unsigned int lockd_net_id;
/*
* These can be set at insmod time (useful for NFS as root filesystem),
diff --git a/fs/locks.c b/fs/locks.c
index 22c5b4aa4961..26811321d39b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -131,7 +131,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
deleted file mode 100644
index 2b4503163930..000000000000
--- a/fs/logfs/Kconfig
+++ /dev/null
@@ -1,17 +0,0 @@
-config LOGFS
- tristate "LogFS file system"
- depends on MTD || (!MTD && BLOCK)
- select ZLIB_INFLATE
- select ZLIB_DEFLATE
- select CRC32
- select BTREE
- help
- Flash filesystem aimed to scale efficiently to large devices.
- In comparison to JFFS2 it offers significantly faster mount
- times and potentially less RAM usage, although the latter has
- not been measured yet.
-
- In its current state it is still very experimental and should
- not be used for other than testing purposes.
-
- If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
deleted file mode 100644
index 4820027787ee..000000000000
--- a/fs/logfs/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-obj-$(CONFIG_LOGFS) += logfs.o
-
-logfs-y += compr.o
-logfs-y += dir.o
-logfs-y += file.o
-logfs-y += gc.o
-logfs-y += inode.o
-logfs-y += journal.o
-logfs-y += readwrite.o
-logfs-y += segment.o
-logfs-y += super.o
-logfs-$(CONFIG_BLOCK) += dev_bdev.o
-logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
deleted file mode 100644
index 961f02b86d97..000000000000
--- a/fs/logfs/compr.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * fs/logfs/compr.c - compression routines
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/vmalloc.h>
-#include <linux/zlib.h>
-
-#define COMPR_LEVEL 3
-
-static DEFINE_MUTEX(compr_mutex);
-static struct z_stream_s stream;
-
-int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
-{
- int err, ret;
-
- ret = -EIO;
- mutex_lock(&compr_mutex);
- err = zlib_deflateInit(&stream, COMPR_LEVEL);
- if (err != Z_OK)
- goto error;
-
- stream.next_in = in;
- stream.avail_in = inlen;
- stream.total_in = 0;
- stream.next_out = out;
- stream.avail_out = outlen;
- stream.total_out = 0;
-
- err = zlib_deflate(&stream, Z_FINISH);
- if (err != Z_STREAM_END)
- goto error;
-
- err = zlib_deflateEnd(&stream);
- if (err != Z_OK)
- goto error;
-
- if (stream.total_out >= stream.total_in)
- goto error;
-
- ret = stream.total_out;
-error:
- mutex_unlock(&compr_mutex);
- return ret;
-}
-
-int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
-{
- int err, ret;
-
- ret = -EIO;
- mutex_lock(&compr_mutex);
- err = zlib_inflateInit(&stream);
- if (err != Z_OK)
- goto error;
-
- stream.next_in = in;
- stream.avail_in = inlen;
- stream.total_in = 0;
- stream.next_out = out;
- stream.avail_out = outlen;
- stream.total_out = 0;
-
- err = zlib_inflate(&stream, Z_FINISH);
- if (err != Z_STREAM_END)
- goto error;
-
- err = zlib_inflateEnd(&stream);
- if (err != Z_OK)
- goto error;
-
- ret = 0;
-error:
- mutex_unlock(&compr_mutex);
- return ret;
-}
-
-int __init logfs_compr_init(void)
-{
- size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
- zlib_inflate_workspacesize());
- stream.workspace = vmalloc(size);
- if (!stream.workspace)
- return -ENOMEM;
- return 0;
-}
-
-void logfs_compr_exit(void)
-{
- vfree(stream.workspace);
-}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
deleted file mode 100644
index a8329cc47dec..000000000000
--- a/fs/logfs/dev_bdev.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * fs/logfs/dev_bdev.c - Device access methods for block devices
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/gfp.h>
-#include <linux/prefetch.h>
-
-#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-
-static int sync_request(struct page *page, struct block_device *bdev, int op)
-{
- struct bio bio;
- struct bio_vec bio_vec;
-
- bio_init(&bio);
- bio.bi_max_vecs = 1;
- bio.bi_io_vec = &bio_vec;
- bio_vec.bv_page = page;
- bio_vec.bv_len = PAGE_SIZE;
- bio_vec.bv_offset = 0;
- bio.bi_vcnt = 1;
- bio.bi_bdev = bdev;
- bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
- bio.bi_iter.bi_size = PAGE_SIZE;
- bio_set_op_attrs(&bio, op, 0);
-
- return submit_bio_wait(&bio);
-}
-
-static int bdev_readpage(void *_sb, struct page *page)
-{
- struct super_block *sb = _sb;
- struct block_device *bdev = logfs_super(sb)->s_bdev;
- int err;
-
- err = sync_request(page, bdev, READ);
- if (err) {
- ClearPageUptodate(page);
- SetPageError(page);
- } else {
- SetPageUptodate(page);
- ClearPageError(page);
- }
- unlock_page(page);
- return err;
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(wq);
-
-static void writeseg_end_io(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
- struct super_block *sb = bio->bi_private;
- struct logfs_super *super = logfs_super(sb);
-
- BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
-
- bio_for_each_segment_all(bvec, bio, i) {
- end_page_writeback(bvec->bv_page);
- put_page(bvec->bv_page);
- }
- bio_put(bio);
- if (atomic_dec_and_test(&super->s_pending_writes))
- wake_up(&wq);
-}
-
-static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
- size_t nr_pages)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- struct bio *bio;
- struct page *page;
- unsigned int max_pages;
- int i;
-
- max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
-
- bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio);
-
- for (i = 0; i < nr_pages; i++) {
- if (i >= max_pages) {
- /* Block layer cannot split bios :( */
- bio->bi_vcnt = i;
- bio->bi_iter.bi_size = i * PAGE_SIZE;
- bio->bi_bdev = super->s_bdev;
- bio->bi_iter.bi_sector = ofs >> 9;
- bio->bi_private = sb;
- bio->bi_end_io = writeseg_end_io;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- atomic_inc(&super->s_pending_writes);
- submit_bio(bio);
-
- ofs += i * PAGE_SIZE;
- index += i;
- nr_pages -= i;
- i = 0;
-
- bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio);
- }
- page = find_lock_page(mapping, index + i);
- BUG_ON(!page);
- bio->bi_io_vec[i].bv_page = page;
- bio->bi_io_vec[i].bv_len = PAGE_SIZE;
- bio->bi_io_vec[i].bv_offset = 0;
-
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- }
- bio->bi_vcnt = nr_pages;
- bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
- bio->bi_bdev = super->s_bdev;
- bio->bi_iter.bi_sector = ofs >> 9;
- bio->bi_private = sb;
- bio->bi_end_io = writeseg_end_io;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- atomic_inc(&super->s_pending_writes);
- submit_bio(bio);
- return 0;
-}
-
-static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
-{
- struct logfs_super *super = logfs_super(sb);
- int head;
-
- BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
-
- if (len == 0) {
- /* This can happen when the object fit perfectly into a
- * segment, the segment gets written per sync and subsequently
- * closed.
- */
- return;
- }
- head = ofs & (PAGE_SIZE - 1);
- if (head) {
- ofs -= head;
- len += head;
- }
- len = PAGE_ALIGN(len);
- __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
-}
-
-
-static void erase_end_io(struct bio *bio)
-{
- struct super_block *sb = bio->bi_private;
- struct logfs_super *super = logfs_super(sb);
-
- BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
- BUG_ON(bio->bi_vcnt == 0);
- bio_put(bio);
- if (atomic_dec_and_test(&super->s_pending_writes))
- wake_up(&wq);
-}
-
-static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
- size_t nr_pages)
-{
- struct logfs_super *super = logfs_super(sb);
- struct bio *bio;
- unsigned int max_pages;
- int i;
-
- max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
-
- bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio);
-
- for (i = 0; i < nr_pages; i++) {
- if (i >= max_pages) {
- /* Block layer cannot split bios :( */
- bio->bi_vcnt = i;
- bio->bi_iter.bi_size = i * PAGE_SIZE;
- bio->bi_bdev = super->s_bdev;
- bio->bi_iter.bi_sector = ofs >> 9;
- bio->bi_private = sb;
- bio->bi_end_io = erase_end_io;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- atomic_inc(&super->s_pending_writes);
- submit_bio(bio);
-
- ofs += i * PAGE_SIZE;
- index += i;
- nr_pages -= i;
- i = 0;
-
- bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio);
- }
- bio->bi_io_vec[i].bv_page = super->s_erase_page;
- bio->bi_io_vec[i].bv_len = PAGE_SIZE;
- bio->bi_io_vec[i].bv_offset = 0;
- }
- bio->bi_vcnt = nr_pages;
- bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
- bio->bi_bdev = super->s_bdev;
- bio->bi_iter.bi_sector = ofs >> 9;
- bio->bi_private = sb;
- bio->bi_end_io = erase_end_io;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- atomic_inc(&super->s_pending_writes);
- submit_bio(bio);
- return 0;
-}
-
-static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
- int ensure_write)
-{
- struct logfs_super *super = logfs_super(sb);
-
- BUG_ON(to & (PAGE_SIZE - 1));
- BUG_ON(len & (PAGE_SIZE - 1));
-
- if (super->s_flags & LOGFS_SB_FLAG_RO)
- return -EROFS;
-
- if (ensure_write) {
- /*
- * Object store doesn't care whether erases happen or not.
- * But for the journal they are required. Otherwise a scan
- * can find an old commit entry and assume it is the current
- * one, travelling back in time.
- */
- do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
- }
-
- return 0;
-}
-
-static void bdev_sync(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
-}
-
-static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = bdev_readpage;
-
- *ofs = 0;
- return read_cache_page(mapping, 0, filler, sb);
-}
-
-static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = bdev_readpage;
- u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
- pgoff_t index = pos >> PAGE_SHIFT;
-
- *ofs = pos;
- return read_cache_page(mapping, index, filler, sb);
-}
-
-static int bdev_write_sb(struct super_block *sb, struct page *page)
-{
- struct block_device *bdev = logfs_super(sb)->s_bdev;
-
- /* Nothing special to do for block devices. */
- return sync_request(page, bdev, WRITE);
-}
-
-static void bdev_put_device(struct logfs_super *s)
-{
- blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
-{
- return 0;
-}
-
-static const struct logfs_device_ops bd_devops = {
- .find_first_sb = bdev_find_first_sb,
- .find_last_sb = bdev_find_last_sb,
- .write_sb = bdev_write_sb,
- .readpage = bdev_readpage,
- .writeseg = bdev_writeseg,
- .erase = bdev_erase,
- .can_write_buf = bdev_can_write_buf,
- .sync = bdev_sync,
- .put_device = bdev_put_device,
-};
-
-int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
- const char *devname)
-{
- struct block_device *bdev;
-
- bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
- type);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
-
- if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
- int mtdnr = MINOR(bdev->bd_dev);
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
- return logfs_get_sb_mtd(p, mtdnr);
- }
-
- p->s_bdev = bdev;
- p->s_mtd = NULL;
- p->s_devops = &bd_devops;
- return 0;
-}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
deleted file mode 100644
index b76a62b1978f..000000000000
--- a/fs/logfs/dev_mtd.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * fs/logfs/dev_mtd.c - Device access methods for MTD
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/completion.h>
-#include <linux/mount.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-
-static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
- void *buf)
-{
- struct mtd_info *mtd = logfs_super(sb)->s_mtd;
- size_t retlen;
- int ret;
-
- ret = mtd_read(mtd, ofs, len, &retlen, buf);
- BUG_ON(ret == -EINVAL);
- if (ret)
- return ret;
-
- /* Not sure if we should loop instead. */
- if (retlen != len)
- return -EIO;
-
- return 0;
-}
-
-static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
- void *buf)
-{
- struct logfs_super *super = logfs_super(sb);
- struct mtd_info *mtd = super->s_mtd;
- size_t retlen;
- loff_t page_start, page_end;
- int ret;
-
- if (super->s_flags & LOGFS_SB_FLAG_RO)
- return -EROFS;
-
- BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
- BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
- BUG_ON(len > PAGE_SIZE);
- page_start = ofs & PAGE_MASK;
- page_end = PAGE_ALIGN(ofs + len) - 1;
- ret = mtd_write(mtd, ofs, len, &retlen, buf);
- if (ret || (retlen != len))
- return -EIO;
-
- return 0;
-}
-
-/*
- * For as long as I can remember (since about 2001) mtd->erase has been an
- * asynchronous interface lacking the first driver to actually use the
- * asynchronous properties. So just to prevent the first implementor of such
- * a thing from breaking logfs in 2350, we do the usual pointless dance to
- * declare a completion variable and wait for completion before returning
- * from logfs_mtd_erase(). What an exercise in futility!
- */
-static void logfs_erase_callback(struct erase_info *ei)
-{
- complete((struct completion *)ei->priv);
-}
-
-static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
- size_t len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- struct page *page;
- pgoff_t index = ofs >> PAGE_SHIFT;
-
- for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
- page = find_get_page(mapping, index);
- if (!page)
- continue;
- memset(page_address(page), 0xFF, PAGE_SIZE);
- put_page(page);
- }
- return 0;
-}
-
-static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
- int ensure_write)
-{
- struct mtd_info *mtd = logfs_super(sb)->s_mtd;
- struct erase_info ei;
- DECLARE_COMPLETION_ONSTACK(complete);
- int ret;
-
- BUG_ON(len % mtd->erasesize);
- if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
- return -EROFS;
-
- memset(&ei, 0, sizeof(ei));
- ei.mtd = mtd;
- ei.addr = ofs;
- ei.len = len;
- ei.callback = logfs_erase_callback;
- ei.priv = (long)&complete;
- ret = mtd_erase(mtd, &ei);
- if (ret)
- return -EIO;
-
- wait_for_completion(&complete);
- if (ei.state != MTD_ERASE_DONE)
- return -EIO;
- return logfs_mtd_erase_mapping(sb, ofs, len);
-}
-
-static void logfs_mtd_sync(struct super_block *sb)
-{
- struct mtd_info *mtd = logfs_super(sb)->s_mtd;
-
- mtd_sync(mtd);
-}
-
-static int logfs_mtd_readpage(void *_sb, struct page *page)
-{
- struct super_block *sb = _sb;
- int err;
-
- err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
- page_address(page));
- if (err == -EUCLEAN || err == -EBADMSG) {
- /* -EBADMSG happens regularly on power failures */
- err = 0;
- /* FIXME: force GC this segment */
- }
- if (err) {
- ClearPageUptodate(page);
- SetPageError(page);
- } else {
- SetPageUptodate(page);
- ClearPageError(page);
- }
- unlock_page(page);
- return err;
-}
-
-static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = logfs_mtd_readpage;
- struct mtd_info *mtd = super->s_mtd;
-
- *ofs = 0;
- while (mtd_block_isbad(mtd, *ofs)) {
- *ofs += mtd->erasesize;
- if (*ofs >= mtd->size)
- return NULL;
- }
- BUG_ON(*ofs & ~PAGE_MASK);
- return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
-}
-
-static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = logfs_mtd_readpage;
- struct mtd_info *mtd = super->s_mtd;
-
- *ofs = mtd->size - mtd->erasesize;
- while (mtd_block_isbad(mtd, *ofs)) {
- *ofs -= mtd->erasesize;
- if (*ofs <= 0)
- return NULL;
- }
- *ofs = *ofs + mtd->erasesize - 0x1000;
- BUG_ON(*ofs & ~PAGE_MASK);
- return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
-}
-
-static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
- size_t nr_pages)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- struct page *page;
- int i, err;
-
- for (i = 0; i < nr_pages; i++) {
- page = find_lock_page(mapping, index + i);
- BUG_ON(!page);
-
- err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
- page_address(page));
- unlock_page(page);
- put_page(page);
- if (err)
- return err;
- }
- return 0;
-}
-
-static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
-{
- struct logfs_super *super = logfs_super(sb);
- int head;
-
- if (super->s_flags & LOGFS_SB_FLAG_RO)
- return;
-
- if (len == 0) {
- /* This can happen when the object fit perfectly into a
- * segment, the segment gets written per sync and subsequently
- * closed.
- */
- return;
- }
- head = ofs & (PAGE_SIZE - 1);
- if (head) {
- ofs -= head;
- len += head;
- }
- len = PAGE_ALIGN(len);
- __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
-}
-
-static void logfs_mtd_put_device(struct logfs_super *s)
-{
- put_mtd_device(s->s_mtd);
-}
-
-static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- void *buf;
- int err;
-
- buf = kmalloc(super->s_writesize, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
- if (err)
- goto out;
- if (memchr_inv(buf, 0xff, super->s_writesize))
- err = -EIO;
- kfree(buf);
-out:
- return err;
-}
-
-static const struct logfs_device_ops mtd_devops = {
- .find_first_sb = logfs_mtd_find_first_sb,
- .find_last_sb = logfs_mtd_find_last_sb,
- .readpage = logfs_mtd_readpage,
- .writeseg = logfs_mtd_writeseg,
- .erase = logfs_mtd_erase,
- .can_write_buf = logfs_mtd_can_write_buf,
- .sync = logfs_mtd_sync,
- .put_device = logfs_mtd_put_device,
-};
-
-int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-{
- struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
- if (IS_ERR(mtd))
- return PTR_ERR(mtd);
-
- s->s_bdev = NULL;
- s->s_mtd = mtd;
- s->s_devops = &mtd_devops;
- return 0;
-}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
deleted file mode 100644
index c87ea52de3d9..000000000000
--- a/fs/logfs/dir.c
+++ /dev/null
@@ -1,801 +0,0 @@
-/*
- * fs/logfs/dir.c - directory-related code
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/slab.h>
-
-/*
- * Atomic dir operations
- *
- * Directory operations are by default not atomic. Dentries and Inodes are
- * created/removed/altered in separate operations. Therefore we need to do
- * a small amount of journaling.
- *
- * Create, link, mkdir, mknod and symlink all share the same function to do
- * the work: __logfs_create. This function works in two atomic steps:
- * 1. allocate inode (remember in journal)
- * 2. allocate dentry (clear journal)
- *
- * As we can only get interrupted between the two, when the inode we just
- * created is simply stored in the anchor. On next mount, if we were
- * interrupted, we delete the inode. From a users point of view the
- * operation never happened.
- *
- * Unlink and rmdir also share the same function: unlink. Again, this
- * function works in two atomic steps
- * 1. remove dentry (remember inode in journal)
- * 2. unlink inode (clear journal)
- *
- * And again, on the next mount, if we were interrupted, we delete the inode.
- * From a users point of view the operation succeeded.
- *
- * Rename is the real pain to deal with, harder than all the other methods
- * combined. Depending on the circumstances we can run into three cases.
- * A "target rename" where the target dentry already existed, a "local
- * rename" where both parent directories are identical or a "cross-directory
- * rename" in the remaining case.
- *
- * Local rename is atomic, as the old dentry is simply rewritten with a new
- * name.
- *
- * Cross-directory rename works in two steps, similar to __logfs_create and
- * logfs_unlink:
- * 1. Write new dentry (remember old dentry in journal)
- * 2. Remove old dentry (clear journal)
- *
- * Here we remember a dentry instead of an inode. On next mount, if we were
- * interrupted, we delete the dentry. From a users point of view, the
- * operation succeeded.
- *
- * Target rename works in three atomic steps:
- * 1. Attach old inode to new dentry (remember old dentry and new inode)
- * 2. Remove old dentry (still remember the new inode)
- * 3. Remove victim inode
- *
- * Here we remember both an inode an a dentry. If we get interrupted
- * between steps 1 and 2, we delete both the dentry and the inode. If
- * we get interrupted between steps 2 and 3, we delete just the inode.
- * In either case, the remaining objects are deleted on next mount. From
- * a users point of view, the operation succeeded.
- */
-
-static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
- loff_t pos)
-{
- return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
-}
-
-static int write_inode(struct inode *inode)
-{
- return __logfs_write_inode(inode, NULL, WF_LOCK);
-}
-
-static s64 dir_seek_data(struct inode *inode, s64 pos)
-{
- s64 new_pos = logfs_seek_data(inode, pos);
-
- return max(pos, new_pos - 1);
-}
-
-static int beyond_eof(struct inode *inode, loff_t bix)
-{
- loff_t pos = bix << inode->i_sb->s_blocksize_bits;
- return pos >= i_size_read(inode);
-}
-
-/*
- * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
- * so short names (len <= 9) don't even occupy the complete 32bit name
- * space. A prime >256 ensures short names quickly spread the 32bit
- * name space. Add about 26 for the estimated amount of information
- * of each character and pick a prime nearby, preferably a bit-sparse
- * one.
- */
-static u32 logfs_hash_32(const char *s, int len, u32 seed)
-{
- u32 hash = seed;
- int i;
-
- for (i = 0; i < len; i++)
- hash = hash * 293 + s[i];
- return hash;
-}
-
-/*
- * We have to satisfy several conflicting requirements here. Small
- * directories should stay fairly compact and not require too many
- * indirect blocks. The number of possible locations for a given hash
- * should be small to make lookup() fast. And we should try hard not
- * to overflow the 32bit name space or nfs and 32bit host systems will
- * be unhappy.
- *
- * So we use the following scheme. First we reduce the hash to 0..15
- * and try a direct block. If that is occupied we reduce the hash to
- * 16..255 and try an indirect block. Same for 2x and 3x indirect
- * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
- * but use buckets containing eight entries instead of a single one.
- *
- * Using 16 entries should allow for a reasonable amount of hash
- * collisions, so the 32bit name space can be packed fairly tight
- * before overflowing. Oh and currently we don't overflow but return
- * and error.
- *
- * How likely are collisions? Doing the appropriate math is beyond me
- * and the Bronstein textbook. But running a test program to brute
- * force collisions for a couple of days showed that on average the
- * first collision occurs after 598M entries, with 290M being the
- * smallest result. Obviously 21 entries could already cause a
- * collision if all entries are carefully chosen.
- */
-static pgoff_t hash_index(u32 hash, int round)
-{
- u32 i0_blocks = I0_BLOCKS;
- u32 i1_blocks = I1_BLOCKS;
- u32 i2_blocks = I2_BLOCKS;
- u32 i3_blocks = I3_BLOCKS;
-
- switch (round) {
- case 0:
- return hash % i0_blocks;
- case 1:
- return i0_blocks + hash % (i1_blocks - i0_blocks);
- case 2:
- return i1_blocks + hash % (i2_blocks - i1_blocks);
- case 3:
- return i2_blocks + hash % (i3_blocks - i2_blocks);
- case 4 ... 19:
- return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
- + round - 4;
- }
- BUG();
-}
-
-static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
-{
- const struct qstr *name = &dentry->d_name;
- struct page *page;
- struct logfs_disk_dentry *dd;
- u32 hash = logfs_hash_32(name->name, name->len, 0);
- pgoff_t index;
- int round;
-
- if (name->len > LOGFS_MAX_NAMELEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- for (round = 0; round < 20; round++) {
- index = hash_index(hash, round);
-
- if (beyond_eof(dir, index))
- return NULL;
- if (!logfs_exist_block(dir, index))
- continue;
- page = read_cache_page(dir->i_mapping, index,
- (filler_t *)logfs_readpage, NULL);
- if (IS_ERR(page))
- return page;
- dd = kmap_atomic(page);
- BUG_ON(dd->namelen == 0);
-
- if (name->len != be16_to_cpu(dd->namelen) ||
- memcmp(name->name, dd->name, name->len)) {
- kunmap_atomic(dd);
- put_page(page);
- continue;
- }
-
- kunmap_atomic(dd);
- return page;
- }
- return NULL;
-}
-
-static int logfs_remove_inode(struct inode *inode)
-{
- int ret;
-
- drop_nlink(inode);
- ret = write_inode(inode);
- LOGFS_BUG_ON(ret, inode->i_sb);
- return ret;
-}
-
-static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
-{
- if (logfs_inode(inode)->li_block)
- logfs_inode(inode)->li_block->ta = NULL;
- kfree(ta);
-}
-
-static int logfs_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct logfs_super *super = logfs_super(dir->i_sb);
- struct inode *inode = d_inode(dentry);
- struct logfs_transaction *ta;
- struct page *page;
- pgoff_t index;
- int ret;
-
- ta = kzalloc(sizeof(*ta), GFP_KERNEL);
- if (!ta)
- return -ENOMEM;
-
- ta->state = UNLINK_1;
- ta->ino = inode->i_ino;
-
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
-
- page = logfs_get_dd_page(dir, dentry);
- if (!page) {
- kfree(ta);
- return -ENOENT;
- }
- if (IS_ERR(page)) {
- kfree(ta);
- return PTR_ERR(page);
- }
- index = page->index;
- put_page(page);
-
- mutex_lock(&super->s_dirop_mutex);
- logfs_add_transaction(dir, ta);
-
- ret = logfs_delete(dir, index, NULL);
- if (!ret)
- ret = write_inode(dir);
-
- if (ret) {
- abort_transaction(dir, ta);
- printk(KERN_ERR"LOGFS: unable to delete inode\n");
- goto out;
- }
-
- ta->state = UNLINK_2;
- logfs_add_transaction(inode, ta);
- ret = logfs_remove_inode(inode);
-out:
- mutex_unlock(&super->s_dirop_mutex);
- return ret;
-}
-
-static inline int logfs_empty_dir(struct inode *dir)
-{
- u64 data;
-
- data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
- return data >= i_size_read(dir);
-}
-
-static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
-
- if (!logfs_empty_dir(inode))
- return -ENOTEMPTY;
-
- return logfs_unlink(dir, dentry);
-}
-
-/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
- * way to combine the two copies */
-static int logfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct inode *dir = file_inode(file);
- loff_t pos;
- struct page *page;
- struct logfs_disk_dentry *dd;
-
- if (ctx->pos < 0)
- return -EINVAL;
-
- if (!dir_emit_dots(file, ctx))
- return 0;
-
- pos = ctx->pos - 2;
- BUG_ON(pos < 0);
- for (;; pos++, ctx->pos++) {
- bool full;
- if (beyond_eof(dir, pos))
- break;
- if (!logfs_exist_block(dir, pos)) {
- /* deleted dentry */
- pos = dir_seek_data(dir, pos);
- continue;
- }
- page = read_cache_page(dir->i_mapping, pos,
- (filler_t *)logfs_readpage, NULL);
- if (IS_ERR(page))
- return PTR_ERR(page);
- dd = kmap(page);
- BUG_ON(dd->namelen == 0);
-
- full = !dir_emit(ctx, (char *)dd->name,
- be16_to_cpu(dd->namelen),
- be64_to_cpu(dd->ino), dd->type);
- kunmap(page);
- put_page(page);
- if (full)
- break;
- }
- return 0;
-}
-
-static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name)
-{
- dd->namelen = cpu_to_be16(name->len);
- memcpy(dd->name, name->name, name->len);
-}
-
-static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct page *page;
- struct logfs_disk_dentry *dd;
- pgoff_t index;
- u64 ino = 0;
- struct inode *inode;
-
- page = logfs_get_dd_page(dir, dentry);
- if (IS_ERR(page))
- return ERR_CAST(page);
- if (!page) {
- d_add(dentry, NULL);
- return NULL;
- }
- index = page->index;
- dd = kmap_atomic(page);
- ino = be64_to_cpu(dd->ino);
- kunmap_atomic(dd);
- put_page(page);
-
- inode = logfs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
- ino, dir->i_ino, index);
- return d_splice_alias(inode, dentry);
-}
-
-static void grow_dir(struct inode *dir, loff_t index)
-{
- index = (index + 1) << dir->i_sb->s_blocksize_bits;
- if (i_size_read(dir) < index)
- i_size_write(dir, index);
-}
-
-static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
- struct inode *inode)
-{
- struct page *page;
- struct logfs_disk_dentry *dd;
- u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0);
- pgoff_t index;
- int round, err;
-
- for (round = 0; round < 20; round++) {
- index = hash_index(hash, round);
-
- if (logfs_exist_block(dir, index))
- continue;
- page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- dd = kmap_atomic(page);
- memset(dd, 0, sizeof(*dd));
- dd->ino = cpu_to_be64(inode->i_ino);
- dd->type = logfs_type(inode);
- logfs_set_name(dd, &dentry->d_name);
- kunmap_atomic(dd);
-
- err = logfs_write_buf(dir, page, WF_LOCK);
- unlock_page(page);
- put_page(page);
- if (!err)
- grow_dir(dir, index);
- return err;
- }
- /* FIXME: Is there a better return value? In most cases neither
- * the filesystem nor the directory are full. But we have had
- * too many collisions for this particular hash and no fallback.
- */
- return -ENOSPC;
-}
-
-static int __logfs_create(struct inode *dir, struct dentry *dentry,
- struct inode *inode, const char *dest, long destlen)
-{
- struct logfs_super *super = logfs_super(dir->i_sb);
- struct logfs_inode *li = logfs_inode(inode);
- struct logfs_transaction *ta;
- int ret;
-
- ta = kzalloc(sizeof(*ta), GFP_KERNEL);
- if (!ta) {
- drop_nlink(inode);
- iput(inode);
- return -ENOMEM;
- }
-
- ta->state = CREATE_1;
- ta->ino = inode->i_ino;
- mutex_lock(&super->s_dirop_mutex);
- logfs_add_transaction(inode, ta);
-
- if (dest) {
- /* symlink */
- ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
- if (!ret)
- ret = write_inode(inode);
- } else {
- /* creat/mkdir/mknod */
- ret = write_inode(inode);
- }
- if (ret) {
- abort_transaction(inode, ta);
- li->li_flags |= LOGFS_IF_STILLBORN;
- /* FIXME: truncate symlink */
- drop_nlink(inode);
- iput(inode);
- goto out;
- }
-
- ta->state = CREATE_2;
- logfs_add_transaction(dir, ta);
- ret = logfs_write_dir(dir, dentry, inode);
- /* sync directory */
- if (!ret)
- ret = write_inode(dir);
-
- if (ret) {
- logfs_del_transaction(dir, ta);
- ta->state = CREATE_2;
- logfs_add_transaction(inode, ta);
- logfs_remove_inode(inode);
- iput(inode);
- goto out;
- }
- d_instantiate(dentry, inode);
-out:
- mutex_unlock(&super->s_dirop_mutex);
- return ret;
-}
-
-static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct inode *inode;
-
- /*
- * FIXME: why do we have to fill in S_IFDIR, while the mode is
- * correct for mknod, creat, etc.? Smells like the vfs *should*
- * do it for us but for some reason fails to do so.
- */
- inode = logfs_new_inode(dir, S_IFDIR | mode);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- inode->i_op = &logfs_dir_iops;
- inode->i_fop = &logfs_dir_fops;
-
- return __logfs_create(dir, dentry, inode, NULL, 0);
-}
-
-static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
-{
- struct inode *inode;
-
- inode = logfs_new_inode(dir, mode);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- inode->i_op = &logfs_reg_iops;
- inode->i_fop = &logfs_reg_fops;
- inode->i_mapping->a_ops = &logfs_reg_aops;
-
- return __logfs_create(dir, dentry, inode, NULL, 0);
-}
-
-static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
- dev_t rdev)
-{
- struct inode *inode;
-
- if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
- return -ENAMETOOLONG;
-
- inode = logfs_new_inode(dir, mode);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- init_special_inode(inode, mode, rdev);
-
- return __logfs_create(dir, dentry, inode, NULL, 0);
-}
-
-static int logfs_symlink(struct inode *dir, struct dentry *dentry,
- const char *target)
-{
- struct inode *inode;
- size_t destlen = strlen(target) + 1;
-
- if (destlen > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
-
- inode = logfs_new_inode(dir, S_IFLNK | 0777);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &logfs_reg_aops;
-
- return __logfs_create(dir, dentry, inode, target, destlen);
-}
-
-static int logfs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
- ihold(inode);
- inc_nlink(inode);
- mark_inode_dirty_sync(inode);
-
- return __logfs_create(dir, dentry, inode, NULL, 0);
-}
-
-static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
- struct logfs_disk_dentry *dd, loff_t *pos)
-{
- struct page *page;
- void *map;
-
- page = logfs_get_dd_page(dir, dentry);
- if (IS_ERR(page))
- return PTR_ERR(page);
- *pos = page->index;
- map = kmap_atomic(page);
- memcpy(dd, map, sizeof(*dd));
- kunmap_atomic(map);
- put_page(page);
- return 0;
-}
-
-static int logfs_delete_dd(struct inode *dir, loff_t pos)
-{
- /*
- * Getting called with pos somewhere beyond eof is either a goofup
- * within this file or means someone maliciously edited the
- * (crc-protected) journal.
- */
- BUG_ON(beyond_eof(dir, pos));
- dir->i_ctime = dir->i_mtime = current_time(dir);
- log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
- return logfs_delete(dir, pos, NULL);
-}
-
-/*
- * Cross-directory rename, target does not exist. Just a little nasty.
- * Create a new dentry in the target dir, then remove the old dentry,
- * all the while taking care to remember our operation in the journal.
- */
-static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct logfs_super *super = logfs_super(old_dir->i_sb);
- struct logfs_disk_dentry dd;
- struct logfs_transaction *ta;
- loff_t pos;
- int err;
-
- /* 1. locate source dd */
- err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
- if (err)
- return err;
-
- ta = kzalloc(sizeof(*ta), GFP_KERNEL);
- if (!ta)
- return -ENOMEM;
-
- ta->state = CROSS_RENAME_1;
- ta->dir = old_dir->i_ino;
- ta->pos = pos;
-
- /* 2. write target dd */
- mutex_lock(&super->s_dirop_mutex);
- logfs_add_transaction(new_dir, ta);
- err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry));
- if (!err)
- err = write_inode(new_dir);
-
- if (err) {
- super->s_rename_dir = 0;
- super->s_rename_pos = 0;
- abort_transaction(new_dir, ta);
- goto out;
- }
-
- /* 3. remove source dd */
- ta->state = CROSS_RENAME_2;
- logfs_add_transaction(old_dir, ta);
- err = logfs_delete_dd(old_dir, pos);
- if (!err)
- err = write_inode(old_dir);
- LOGFS_BUG_ON(err, old_dir->i_sb);
-out:
- mutex_unlock(&super->s_dirop_mutex);
- return err;
-}
-
-static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
- struct logfs_disk_dentry *dd, struct inode *inode)
-{
- loff_t pos;
- int err;
-
- err = logfs_get_dd(dir, dentry, dd, &pos);
- if (err)
- return err;
- dd->ino = cpu_to_be64(inode->i_ino);
- dd->type = logfs_type(inode);
-
- err = write_dir(dir, dd, pos);
- if (err)
- return err;
- log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
- dd->name, be64_to_cpu(dd->ino));
- return write_inode(dir);
-}
-
-/* Target dentry exists - the worst case. We need to attach the source
- * inode to the target dentry, then remove the orphaned target inode and
- * source dentry.
- */
-static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct logfs_super *super = logfs_super(old_dir->i_sb);
- struct inode *old_inode = d_inode(old_dentry);
- struct inode *new_inode = d_inode(new_dentry);
- int isdir = S_ISDIR(old_inode->i_mode);
- struct logfs_disk_dentry dd;
- struct logfs_transaction *ta;
- loff_t pos;
- int err;
-
- BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
- if (isdir) {
- if (!logfs_empty_dir(new_inode))
- return -ENOTEMPTY;
- }
-
- /* 1. locate source dd */
- err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
- if (err)
- return err;
-
- ta = kzalloc(sizeof(*ta), GFP_KERNEL);
- if (!ta)
- return -ENOMEM;
-
- ta->state = TARGET_RENAME_1;
- ta->dir = old_dir->i_ino;
- ta->pos = pos;
- ta->ino = new_inode->i_ino;
-
- /* 2. attach source inode to target dd */
- mutex_lock(&super->s_dirop_mutex);
- logfs_add_transaction(new_dir, ta);
- err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
- if (err) {
- super->s_rename_dir = 0;
- super->s_rename_pos = 0;
- super->s_victim_ino = 0;
- abort_transaction(new_dir, ta);
- goto out;
- }
-
- /* 3. remove source dd */
- ta->state = TARGET_RENAME_2;
- logfs_add_transaction(old_dir, ta);
- err = logfs_delete_dd(old_dir, pos);
- if (!err)
- err = write_inode(old_dir);
- LOGFS_BUG_ON(err, old_dir->i_sb);
-
- /* 4. remove target inode */
- ta->state = TARGET_RENAME_3;
- logfs_add_transaction(new_inode, ta);
- err = logfs_remove_inode(new_inode);
-
-out:
- mutex_unlock(&super->s_dirop_mutex);
- return err;
-}
-
-static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- if (d_really_is_positive(new_dentry))
- return logfs_rename_target(old_dir, old_dentry,
- new_dir, new_dentry);
- return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
-}
-
-/* No locking done here, as this is called before .get_sb() returns. */
-int logfs_replay_journal(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode;
- u64 ino, pos;
- int err;
-
- if (super->s_victim_ino) {
- /* delete victim inode */
- ino = super->s_victim_ino;
- printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
- inode = logfs_iget(sb, ino);
- if (IS_ERR(inode))
- goto fail;
-
- LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
- super->s_victim_ino = 0;
- err = logfs_remove_inode(inode);
- iput(inode);
- if (err) {
- super->s_victim_ino = ino;
- goto fail;
- }
- }
- if (super->s_rename_dir) {
- /* delete old dd from rename */
- ino = super->s_rename_dir;
- pos = super->s_rename_pos;
- printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
- ino, pos);
- inode = logfs_iget(sb, ino);
- if (IS_ERR(inode))
- goto fail;
-
- super->s_rename_dir = 0;
- super->s_rename_pos = 0;
- err = logfs_delete_dd(inode, pos);
- iput(inode);
- if (err) {
- super->s_rename_dir = ino;
- super->s_rename_pos = pos;
- goto fail;
- }
- }
- return 0;
-fail:
- LOGFS_BUG(sb);
- return -EIO;
-}
-
-const struct inode_operations logfs_dir_iops = {
- .create = logfs_create,
- .link = logfs_link,
- .lookup = logfs_lookup,
- .mkdir = logfs_mkdir,
- .mknod = logfs_mknod,
- .rename = logfs_rename,
- .rmdir = logfs_rmdir,
- .symlink = logfs_symlink,
- .unlink = logfs_unlink,
-};
-const struct file_operations logfs_dir_fops = {
- .fsync = logfs_fsync,
- .unlocked_ioctl = logfs_ioctl,
- .iterate_shared = logfs_readdir,
- .read = generic_read_dir,
- .llseek = generic_file_llseek,
-};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
deleted file mode 100644
index 1db04930ad57..000000000000
--- a/fs/logfs/file.c
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * fs/logfs/file.c - prepare_write, commit_write and friends
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/sched.h>
-#include <linux/writeback.h>
-
-static int logfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- struct inode *inode = mapping->host;
- struct page *page;
- pgoff_t index = pos >> PAGE_SHIFT;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
- if ((len == PAGE_SIZE) || PageUptodate(page))
- return 0;
- if ((pos & PAGE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_SIZE - 1);
- unsigned end = start + len;
-
- /* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_SIZE);
- return 0;
- }
- return logfs_readpage_nolock(page);
-}
-
-static int logfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct page *page,
- void *fsdata)
-{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
- unsigned start = pos & (PAGE_SIZE - 1);
- unsigned end = start + copied;
- int ret = 0;
-
- BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize);
- BUG_ON(page->index > I3_BLOCKS);
-
- if (copied < len) {
- /*
- * Short write of a non-initialized paged. Just tell userspace
- * to retry the entire page.
- */
- if (!PageUptodate(page)) {
- copied = 0;
- goto out;
- }
- }
- if (copied == 0)
- goto out; /* FIXME: do we need to update inode? */
-
- if (i_size_read(inode) < (index << PAGE_SHIFT) + end) {
- i_size_write(inode, (index << PAGE_SHIFT) + end);
- mark_inode_dirty_sync(inode);
- }
-
- SetPageUptodate(page);
- if (!PageDirty(page)) {
- if (!get_page_reserve(inode, page))
- __set_page_dirty_nobuffers(page);
- else
- ret = logfs_write_buf(inode, page, WF_LOCK);
- }
-out:
- unlock_page(page);
- put_page(page);
- return ret ? ret : copied;
-}
-
-int logfs_readpage(struct file *file, struct page *page)
-{
- int ret;
-
- ret = logfs_readpage_nolock(page);
- unlock_page(page);
- return ret;
-}
-
-/* Clear the page's dirty flag in the radix tree. */
-/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
- * the dirty bit from the radix tree for filesystems that don't have to wait
- * for page writeback to finish (i.e. any compressing filesystem).
- */
-static void clear_radix_tree_dirty(struct page *page)
-{
- BUG_ON(PagePrivate(page) || page->private);
- set_page_writeback(page);
- end_page_writeback(page);
-}
-
-static int __logfs_writepage(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- int err;
-
- err = logfs_write_buf(inode, page, WF_LOCK);
- if (err)
- set_page_dirty(page);
- else
- clear_radix_tree_dirty(page);
- unlock_page(page);
- return err;
-}
-
-static int logfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned offset;
- u64 bix;
- level_t level;
-
- log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
- page);
-
- logfs_unpack_index(page->index, &bix, &level);
-
- /* Indirect blocks are never truncated */
- if (level != 0)
- return __logfs_writepage(page);
-
- /*
- * TODO: everything below is a near-verbatim copy of nobh_writepage().
- * The relevant bits should be factored out after logfs is merged.
- */
-
- /* Is the page fully inside i_size? */
- if (bix < end_index)
- return __logfs_writepage(page);
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE-1);
- if (bix > end_index || offset == 0) {
- unlock_page(page);
- return 0; /* don't care */
- }
-
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invokation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- zero_user_segment(page, offset, PAGE_SIZE);
- return __logfs_writepage(page);
-}
-
-static void logfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- struct logfs_block *block = logfs_block(page);
-
- if (block->reserved_bytes) {
- struct super_block *sb = page->mapping->host->i_sb;
- struct logfs_super *super = logfs_super(sb);
-
- super->s_dirty_pages -= block->reserved_bytes;
- block->ops->free_block(sb, block);
- BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
- } else
- move_page_to_btree(page);
- BUG_ON(PagePrivate(page) || page->private);
-}
-
-static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
-{
- return 0; /* None of these are easy to release */
-}
-
-
-long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- struct inode *inode = file_inode(file);
- struct logfs_inode *li = logfs_inode(inode);
- unsigned int oldflags, flags;
- int err;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
- return put_user(flags, (int __user *)arg);
- case FS_IOC_SETFLAGS:
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!inode_owner_or_capable(inode))
- return -EACCES;
-
- err = get_user(flags, (int __user *)arg);
- if (err)
- return err;
-
- inode_lock(inode);
- oldflags = li->li_flags;
- flags &= LOGFS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
- li->li_flags = flags;
- inode_unlock(inode);
-
- inode->i_ctime = current_time(inode);
- mark_inode_dirty_sync(inode);
- return 0;
-
- default:
- return -ENOTTY;
- }
-}
-
-int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct super_block *sb = file->f_mapping->host->i_sb;
- struct inode *inode = file->f_mapping->host;
- int ret;
-
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
-
- inode_lock(inode);
- logfs_get_wblocks(sb, NULL, WF_LOCK);
- logfs_write_anchor(sb);
- logfs_put_wblocks(sb, NULL, WF_LOCK);
- inode_unlock(inode);
-
- return 0;
-}
-
-static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- int err = 0;
-
- err = setattr_prepare(dentry, attr);
- if (err)
- return err;
-
- if (attr->ia_valid & ATTR_SIZE) {
- err = logfs_truncate(inode, attr->ia_size);
- if (err)
- return err;
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations logfs_reg_iops = {
- .setattr = logfs_setattr,
-};
-
-const struct file_operations logfs_reg_fops = {
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .fsync = logfs_fsync,
- .unlocked_ioctl = logfs_ioctl,
- .llseek = generic_file_llseek,
- .mmap = generic_file_readonly_mmap,
- .open = generic_file_open,
-};
-
-const struct address_space_operations logfs_reg_aops = {
- .invalidatepage = logfs_invalidatepage,
- .readpage = logfs_readpage,
- .releasepage = logfs_releasepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
- .writepage = logfs_writepage,
- .writepages = generic_writepages,
- .write_begin = logfs_write_begin,
- .write_end = logfs_write_end,
-};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
deleted file mode 100644
index d4efb061bdc5..000000000000
--- a/fs/logfs/gc.c
+++ /dev/null
@@ -1,732 +0,0 @@
-/*
- * fs/logfs/gc.c - garbage collection code
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-/*
- * Wear leveling needs to kick in when the difference between low erase
- * counts and high erase counts gets too big. A good value for "too big"
- * may be somewhat below 10% of maximum erase count for the device.
- * Why not 397, to pick a nice round number with no specific meaning? :)
- *
- * WL_RATELIMIT is the minimum time between two wear level events. A huge
- * number of segments may fulfil the requirements for wear leveling at the
- * same time. If that happens we don't want to cause a latency from hell,
- * but just gently pick one segment every so often and minimize overhead.
- */
-#define WL_DELTA 397
-#define WL_RATELIMIT 100
-#define MAX_OBJ_ALIASES 2600
-#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
-#define LIST_SIZE 64 /* base size of candidate lists */
-#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
-#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
-
-static int no_free_segments(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- return super->s_free_list.count;
-}
-
-/* journal has distance -1, top-most ifile layer distance 0 */
-static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
-{
- struct logfs_super *super = logfs_super(sb);
- u8 gc_level = (__force u8)__gc_level;
-
- switch (gc_level) {
- case 0: /* fall through */
- case 1: /* fall through */
- case 2: /* fall through */
- case 3:
- /* file data or indirect blocks */
- return super->s_ifile_levels + super->s_iblock_levels - gc_level;
- case 6: /* fall through */
- case 7: /* fall through */
- case 8: /* fall through */
- case 9:
- /* inode file data or indirect blocks */
- return super->s_ifile_levels - (gc_level - 6);
- default:
- printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
- gc_level);
- WARN_ON(1);
- return super->s_ifile_levels + super->s_iblock_levels;
- }
-}
-
-static int segment_is_reserved(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area;
- void *reserved;
- int i;
-
- /* Some segments are reserved. Just pretend they were all valid */
- reserved = btree_lookup32(&super->s_reserved_segments, segno);
- if (reserved)
- return 1;
-
- /* Currently open segments */
- for_each_area(i) {
- area = super->s_area[i];
- if (area->a_is_open && area->a_segno == segno)
- return 1;
- }
-
- return 0;
-}
-
-static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
-{
- BUG();
-}
-
-/*
- * Returns the bytes consumed by valid objects in this segment. Object headers
- * are counted, the segment header is not.
- */
-static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
- gc_level_t *gc_level)
-{
- struct logfs_segment_entry se;
- u32 ec_level;
-
- logfs_get_segment_entry(sb, segno, &se);
- if (se.ec_level == cpu_to_be32(BADSEG) ||
- se.valid == cpu_to_be32(RESERVED))
- return RESERVED;
-
- ec_level = be32_to_cpu(se.ec_level);
- *ec = ec_level >> 4;
- *gc_level = GC_LEVEL(ec_level & 0xf);
- return be32_to_cpu(se.valid);
-}
-
-static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
- u64 bix, gc_level_t gc_level)
-{
- struct inode *inode;
- int err, cookie;
-
- inode = logfs_safe_iget(sb, ino, &cookie);
- err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
- BUG_ON(err);
- logfs_safe_iput(inode, cookie);
-}
-
-static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_segment_header sh;
- struct logfs_object_header oh;
- u64 ofs, ino, bix;
- u32 seg_ofs, logical_segno, cleaned = 0;
- int err, len, valid;
- gc_level_t gc_level;
-
- LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
-
- btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
- err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
- BUG_ON(err);
- gc_level = GC_LEVEL(sh.level);
- logical_segno = be32_to_cpu(sh.segno);
- if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
- logfs_mark_segment_bad(sb, segno);
- cleaned = -1;
- goto out;
- }
-
- for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
- seg_ofs + sizeof(oh) < super->s_segsize; ) {
- ofs = dev_ofs(sb, logical_segno, seg_ofs);
- err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
- &oh);
- BUG_ON(err);
-
- if (!memchr_inv(&oh, 0xff, sizeof(oh)))
- break;
-
- if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
- logfs_mark_segment_bad(sb, segno);
- cleaned = super->s_segsize - 1;
- goto out;
- }
-
- ino = be64_to_cpu(oh.ino);
- bix = be64_to_cpu(oh.bix);
- len = sizeof(oh) + be16_to_cpu(oh.len);
- valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
- if (valid == 1) {
- logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
- cleaned += len;
- } else if (valid == 2) {
- /* Will be invalid upon journal commit */
- cleaned += len;
- }
- seg_ofs += len;
- }
-out:
- btree_remove32(&super->s_reserved_segments, segno);
- return cleaned;
-}
-
-static struct gc_candidate *add_list(struct gc_candidate *cand,
- struct candidate_list *list)
-{
- struct rb_node **p = &list->rb_tree.rb_node;
- struct rb_node *parent = NULL;
- struct gc_candidate *cur;
- int comp;
-
- cand->list = list;
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct gc_candidate, rb_node);
-
- if (list->sort_by_ec)
- comp = cand->erase_count < cur->erase_count;
- else
- comp = cand->valid < cur->valid;
-
- if (comp)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
- }
- rb_link_node(&cand->rb_node, parent, p);
- rb_insert_color(&cand->rb_node, &list->rb_tree);
-
- if (list->count <= list->maxcount) {
- list->count++;
- return NULL;
- }
- cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
- rb_erase(&cand->rb_node, &list->rb_tree);
- cand->list = NULL;
- return cand;
-}
-
-static void remove_from_list(struct gc_candidate *cand)
-{
- struct candidate_list *list = cand->list;
-
- rb_erase(&cand->rb_node, &list->rb_tree);
- list->count--;
-}
-
-static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
-
- btree_remove32(&super->s_cand_tree, cand->segno);
- kfree(cand);
-}
-
-u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
-{
- struct gc_candidate *cand;
- u32 segno;
-
- BUG_ON(list->count == 0);
-
- cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
- remove_from_list(cand);
- segno = cand->segno;
- if (ec)
- *ec = cand->erase_count;
- free_candidate(sb, cand);
- return segno;
-}
-
-/*
- * We have several lists to manage segments with. The reserve_list is used to
- * deal with bad blocks. We try to keep the best (lowest ec) segments on this
- * list.
- * The free_list contains free segments for normal usage. It usually gets the
- * second pick after the reserve_list. But when the free_list is running short
- * it is more important to keep the free_list full than to keep a reserve.
- *
- * Segments that are not free are put onto a per-level low_list. If we have
- * to run garbage collection, we pick a candidate from there. All segments on
- * those lists should have at least some free space so GC will make progress.
- *
- * And last we have the ec_list, which is used to pick segments for wear
- * leveling.
- *
- * If all appropriate lists are full, we simply free the candidate and forget
- * about that segment for a while. We have better candidates for each purpose.
- */
-static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
- u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
-
- if (cand->valid == 0) {
- /* 100% free segments */
- log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
- cand->segno, cand->erase_count,
- dev_ofs(sb, cand->segno, 0));
- cand = add_list(cand, &super->s_reserve_list);
- if (cand) {
- log_gc_noisy("add free segment %x (ec %x) at %llx\n",
- cand->segno, cand->erase_count,
- dev_ofs(sb, cand->segno, 0));
- cand = add_list(cand, &super->s_free_list);
- }
- } else {
- /* good candidates for Garbage Collection */
- if (cand->valid < full)
- cand = add_list(cand, &super->s_low_list[cand->dist]);
- /* good candidates for wear leveling,
- * segments that were recently written get ignored */
- if (cand)
- cand = add_list(cand, &super->s_ec_list);
- }
- if (cand)
- free_candidate(sb, cand);
-}
-
-static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
- u8 dist)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
-
- cand = kmalloc(sizeof(*cand), GFP_NOFS);
- if (!cand)
- return -ENOMEM;
-
- cand->segno = segno;
- cand->valid = valid;
- cand->erase_count = ec;
- cand->dist = dist;
-
- btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
- __add_candidate(sb, cand);
- return 0;
-}
-
-static void remove_segment_from_lists(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
-
- cand = btree_lookup32(&super->s_cand_tree, segno);
- if (cand) {
- remove_from_list(cand);
- free_candidate(sb, cand);
- }
-}
-
-static void scan_segment(struct super_block *sb, u32 segno)
-{
- u32 valid, ec = 0;
- gc_level_t gc_level = 0;
- u8 dist;
-
- if (segment_is_reserved(sb, segno))
- return;
-
- remove_segment_from_lists(sb, segno);
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- if (valid == RESERVED)
- return;
-
- dist = root_distance(sb, gc_level);
- add_candidate(sb, segno, valid, ec, dist);
-}
-
-static struct gc_candidate *first_in_list(struct candidate_list *list)
-{
- if (list->count == 0)
- return NULL;
- return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
-}
-
-/*
- * Find the best segment for garbage collection. Main criterion is
- * the segment requiring the least effort to clean. Secondary
- * criterion is to GC on the lowest level available.
- *
- * So we search the least effort segment on the lowest level first,
- * then move up and pick another segment iff is requires significantly
- * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
- */
-static struct gc_candidate *get_candidate(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i, max_dist;
- struct gc_candidate *cand = NULL, *this;
-
- max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
-
- for (i = max_dist; i >= 0; i--) {
- this = first_in_list(&super->s_low_list[i]);
- if (!this)
- continue;
- if (!cand)
- cand = this;
- if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
- cand = this;
- }
- return cand;
-}
-
-static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
- gc_level_t gc_level;
- u32 cleaned, valid, segno, ec;
- u8 dist;
-
- if (!cand) {
- log_gc("GC attempted, but no candidate found\n");
- return 0;
- }
-
- segno = cand->segno;
- dist = cand->dist;
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- free_candidate(sb, cand);
- log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
- segno, (u64)segno << super->s_segshift,
- dist, no_free_segments(sb), valid,
- super->s_free_bytes);
- cleaned = logfs_gc_segment(sb, segno);
- log_gc("GC segment #%02x complete - now %x valid\n", segno,
- valid - cleaned);
- BUG_ON(cleaned != valid);
- return 1;
-}
-
-static int logfs_gc_once(struct super_block *sb)
-{
- struct gc_candidate *cand;
-
- cand = get_candidate(sb);
- if (cand)
- remove_from_list(cand);
- return __logfs_gc_once(sb, cand);
-}
-
-/* returns 1 if a wrap occurs, 0 otherwise */
-static int logfs_scan_some(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- u32 segno;
- int i, ret = 0;
-
- segno = super->s_sweeper;
- for (i = SCAN_RATIO; i > 0; i--) {
- segno++;
- if (segno >= super->s_no_segs) {
- segno = 0;
- ret = 1;
- /* Break out of the loop. We want to read a single
- * block from the segment size on next invocation if
- * SCAN_RATIO is set to match block size
- */
- break;
- }
-
- scan_segment(sb, segno);
- }
- super->s_sweeper = segno;
- return ret;
-}
-
-/*
- * In principle, this function should loop forever, looking for GC candidates
- * and moving data. LogFS is designed in such a way that this loop is
- * guaranteed to terminate.
- *
- * Limiting the loop to some iterations serves purely to catch cases when
- * these guarantees have failed. An actual endless loop is an obvious bug
- * and should be reported as such.
- */
-static void __logfs_gc_pass(struct super_block *sb, int target)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_block *block;
- int round, progress, last_progress = 0;
-
- /*
- * Doing too many changes to the segfile at once would result
- * in a large number of aliases. Write the journal before
- * things get out of hand.
- */
- if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
- logfs_write_anchor(sb);
-
- if (no_free_segments(sb) >= target &&
- super->s_no_object_aliases < MAX_OBJ_ALIASES)
- return;
-
- log_gc("__logfs_gc_pass(%x)\n", target);
- for (round = 0; round < SCAN_ROUNDS; ) {
- if (no_free_segments(sb) >= target)
- goto write_alias;
-
- /* Sync in-memory state with on-medium state in case they
- * diverged */
- logfs_write_anchor(sb);
- round += logfs_scan_some(sb);
- if (no_free_segments(sb) >= target)
- goto write_alias;
- progress = logfs_gc_once(sb);
- if (progress)
- last_progress = round;
- else if (round - last_progress > 2)
- break;
- continue;
-
- /*
- * The goto logic is nasty, I just don't know a better way to
- * code it. GC is supposed to ensure two things:
- * 1. Enough free segments are available.
- * 2. The number of aliases is bounded.
- * When 1. is achieved, we take a look at 2. and write back
- * some alias-containing blocks, if necessary. However, after
- * each such write we need to go back to 1., as writes can
- * consume free segments.
- */
-write_alias:
- if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
- return;
- if (list_empty(&super->s_object_alias)) {
- /* All aliases are still in btree */
- return;
- }
- log_gc("Write back one alias\n");
- block = list_entry(super->s_object_alias.next,
- struct logfs_block, alias_list);
- block->ops->write_block(block);
- /*
- * To round off the nasty goto logic, we reset round here. It
- * is a safety-net for GC not making any progress and limited
- * to something reasonably small. If incremented it for every
- * single alias, the loop could terminate rather quickly.
- */
- round = 0;
- }
- LOGFS_BUG(sb);
-}
-
-static int wl_ratelimit(struct super_block *sb, u64 *next_event)
-{
- struct logfs_super *super = logfs_super(sb);
-
- if (*next_event < super->s_gec) {
- *next_event = super->s_gec + WL_RATELIMIT;
- return 0;
- }
- return 1;
-}
-
-static void logfs_wl_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *wl_cand, *free_cand;
-
- if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
- return;
-
- wl_cand = first_in_list(&super->s_ec_list);
- if (!wl_cand)
- return;
- free_cand = first_in_list(&super->s_free_list);
- if (!free_cand)
- return;
-
- if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
- remove_from_list(wl_cand);
- __logfs_gc_once(sb, wl_cand);
- }
-}
-
-/*
- * The journal needs wear leveling as well. But moving the journal is an
- * expensive operation so we try to avoid it as much as possible. And if we
- * have to do it, we move the whole journal, not individual segments.
- *
- * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
- * calculations. First we check whether moving the journal would be a
- * significant improvement. That means that a) the current journal segments
- * have more wear than the future journal segments and b) the current journal
- * segments have more wear than normal ostore segments.
- * Rationale for b) is that we don't have to move the journal if it is aging
- * less than the ostore, even if the reserve segments age even less (they are
- * excluded from wear leveling, after all).
- * Next we check that the superblocks have less wear than the journal. Since
- * moving the journal requires writing the superblocks, we have to protect the
- * superblocks even more than the journal.
- *
- * Also we double the acceptable wear difference, compared to ostore wear
- * leveling. Journal data is read and rewritten rapidly, comparatively. So
- * soft errors have much less time to accumulate and we allow the journal to
- * be a bit worse than the ostore.
- */
-static void logfs_journal_wl_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
- u32 min_journal_ec = -1, max_reserve_ec = 0;
- int i;
-
- if (wl_ratelimit(sb, &super->s_wl_gec_journal))
- return;
-
- if (super->s_reserve_list.count < super->s_no_journal_segs) {
- /* Reserve is not full enough to move complete journal */
- return;
- }
-
- journal_for_each(i)
- if (super->s_journal_seg[i])
- min_journal_ec = min(min_journal_ec,
- super->s_journal_ec[i]);
- cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
- struct gc_candidate, rb_node);
- max_reserve_ec = cand->erase_count;
- for (i = 0; i < 2; i++) {
- struct logfs_segment_entry se;
- u32 segno = seg_no(sb, super->s_sb_ofs[i]);
- u32 ec;
-
- logfs_get_segment_entry(sb, segno, &se);
- ec = be32_to_cpu(se.ec_level) >> 4;
- max_reserve_ec = max(max_reserve_ec, ec);
- }
-
- if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
- do_logfs_journal_wl_pass(sb);
- }
-}
-
-void logfs_gc_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
- /* Write journal before free space is getting saturated with dirty
- * objects.
- */
- if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
- + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
- logfs_write_anchor(sb);
- __logfs_gc_pass(sb, super->s_total_levels);
- logfs_wl_pass(sb);
- logfs_journal_wl_pass(sb);
-}
-
-static int check_area(struct super_block *sb, int i)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_area[i];
- gc_level_t gc_level;
- u32 cleaned, valid, ec;
- u32 segno = area->a_segno;
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
-
- if (!area->a_is_open)
- return 0;
-
- if (super->s_devops->can_write_buf(sb, ofs) == 0)
- return 0;
-
- printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
- /*
- * The device cannot write back the write buffer. Most likely the
- * wbuf was already written out and the system crashed at some point
- * before the journal commit happened. In that case we wouldn't have
- * to do anything. But if the crash happened before the wbuf was
- * written out correctly, we must GC this segment. So assume the
- * worst and always do the GC run.
- */
- area->a_is_open = 0;
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- cleaned = logfs_gc_segment(sb, segno);
- if (cleaned != valid)
- return -EIO;
- return 0;
-}
-
-int logfs_check_areas(struct super_block *sb)
-{
- int i, err;
-
- for_each_area(i) {
- err = check_area(sb, i);
- if (err)
- return err;
- }
- return 0;
-}
-
-static void logfs_init_candlist(struct candidate_list *list, int maxcount,
- int sort_by_ec)
-{
- list->count = 0;
- list->maxcount = maxcount;
- list->sort_by_ec = sort_by_ec;
- list->rb_tree = RB_ROOT;
-}
-
-int logfs_init_gc(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
- logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
- logfs_init_candlist(&super->s_reserve_list,
- super->s_bad_seg_reserve, 1);
- for_each_area(i)
- logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
- logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
- return 0;
-}
-
-static void logfs_cleanup_list(struct super_block *sb,
- struct candidate_list *list)
-{
- struct gc_candidate *cand;
-
- while (list->count) {
- cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
- rb_node);
- remove_from_list(cand);
- free_candidate(sb, cand);
- }
- BUG_ON(list->rb_tree.rb_node);
-}
-
-void logfs_cleanup_gc(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- if (!super->s_free_list.count)
- return;
-
- /*
- * FIXME: The btree may still contain a single empty node. So we
- * call the grim visitor to clean up that mess. Btree code should
- * do it for us, really.
- */
- btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
- logfs_cleanup_list(sb, &super->s_free_list);
- logfs_cleanup_list(sb, &super->s_reserve_list);
- for_each_area(i)
- logfs_cleanup_list(sb, &super->s_low_list[i]);
- logfs_cleanup_list(sb, &super->s_ec_list);
-}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
deleted file mode 100644
index f440a1525da8..000000000000
--- a/fs/logfs/inode.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * fs/logfs/inode.c - inode handling code
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/backing-dev.h>
-
-/*
- * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
- * on the medium. It therefore also lacks a method to store the previous
- * generation number for deleted inodes. Instead a single generation number
- * is stored which will be used for new inodes. Being just a 32bit counter,
- * this can obvious wrap relatively quickly. So we only reuse inodes if we
- * know that a fair number of inodes can be created before we have to increment
- * the generation again - effectively adding some bits to the counter.
- * But being too aggressive here means we keep a very large and very sparse
- * inode file, wasting space on indirect blocks.
- * So what is a good value? Beats me. 64k seems moderately bad on both
- * fronts, so let's use that for now...
- *
- * NFS sucks, as everyone already knows.
- */
-#define INOS_PER_WRAP (0x10000)
-
-/*
- * Logfs' requirement to read inodes for garbage collection makes life a bit
- * harder. GC may have to read inodes that are in I_FREEING state, when they
- * are being written out - and waiting for GC to make progress, naturally.
- *
- * So we cannot just call iget() or some variant of it, but first have to check
- * whether the inode in question might be in I_FREEING state. Therefore we
- * maintain our own per-sb list of "almost deleted" inodes and check against
- * that list first. Normally this should be at most 1-2 entries long.
- *
- * Also, inodes have logfs-specific reference counting on top of what the vfs
- * does. When .destroy_inode is called, normally the reference count will drop
- * to zero and the inode gets deleted. But if GC accessed the inode, its
- * refcount will remain nonzero and final deletion will have to wait.
- *
- * As a result we have two sets of functions to get/put inodes:
- * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
- * logfs_iget/iput - normal version
- */
-static struct kmem_cache *logfs_inode_cache;
-
-static DEFINE_SPINLOCK(logfs_inode_lock);
-
-static void logfs_inode_setops(struct inode *inode)
-{
- switch (inode->i_mode & S_IFMT) {
- case S_IFDIR:
- inode->i_op = &logfs_dir_iops;
- inode->i_fop = &logfs_dir_fops;
- inode->i_mapping->a_ops = &logfs_reg_aops;
- break;
- case S_IFREG:
- inode->i_op = &logfs_reg_iops;
- inode->i_fop = &logfs_reg_fops;
- inode->i_mapping->a_ops = &logfs_reg_aops;
- break;
- case S_IFLNK:
- inode->i_op = &page_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &logfs_reg_aops;
- break;
- case S_IFSOCK: /* fall through */
- case S_IFBLK: /* fall through */
- case S_IFCHR: /* fall through */
- case S_IFIFO:
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- break;
- default:
- BUG();
- }
-}
-
-static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
-{
- struct inode *inode = iget_locked(sb, ino);
- int err;
-
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- err = logfs_read_inode(inode);
- if (err || inode->i_nlink == 0) {
- /* inode->i_nlink == 0 can be true when called from
- * block validator */
- /* set i_nlink to 0 to prevent caching */
- clear_nlink(inode);
- logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
- iget_failed(inode);
- if (!err)
- err = -ENOENT;
- return ERR_PTR(err);
- }
-
- logfs_inode_setops(inode);
- unlock_new_inode(inode);
- return inode;
-}
-
-struct inode *logfs_iget(struct super_block *sb, ino_t ino)
-{
- BUG_ON(ino == LOGFS_INO_MASTER);
- BUG_ON(ino == LOGFS_INO_SEGFILE);
- return __logfs_iget(sb, ino);
-}
-
-/*
- * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
- * this allows logfs_iput to do the right thing later
- */
-struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_inode *li;
-
- if (ino == LOGFS_INO_MASTER)
- return super->s_master_inode;
- if (ino == LOGFS_INO_SEGFILE)
- return super->s_segfile_inode;
-
- spin_lock(&logfs_inode_lock);
- list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
- if (li->vfs_inode.i_ino == ino) {
- li->li_refcount++;
- spin_unlock(&logfs_inode_lock);
- *is_cached = 1;
- return &li->vfs_inode;
- }
- spin_unlock(&logfs_inode_lock);
-
- *is_cached = 0;
- return __logfs_iget(sb, ino);
-}
-
-static void logfs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
-}
-
-static void __logfs_destroy_inode(struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- BUG_ON(li->li_block);
- list_del(&li->li_freeing_list);
- call_rcu(&inode->i_rcu, logfs_i_callback);
-}
-
-static void __logfs_destroy_meta_inode(struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- BUG_ON(li->li_block);
- call_rcu(&inode->i_rcu, logfs_i_callback);
-}
-
-static void logfs_destroy_inode(struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- if (inode->i_ino < LOGFS_RESERVED_INOS) {
- /*
- * The reserved inodes are never destroyed unless we are in
- * unmont path.
- */
- __logfs_destroy_meta_inode(inode);
- return;
- }
-
- BUG_ON(list_empty(&li->li_freeing_list));
- spin_lock(&logfs_inode_lock);
- li->li_refcount--;
- if (li->li_refcount == 0)
- __logfs_destroy_inode(inode);
- spin_unlock(&logfs_inode_lock);
-}
-
-void logfs_safe_iput(struct inode *inode, int is_cached)
-{
- if (inode->i_ino == LOGFS_INO_MASTER)
- return;
- if (inode->i_ino == LOGFS_INO_SEGFILE)
- return;
-
- if (is_cached) {
- logfs_destroy_inode(inode);
- return;
- }
-
- iput(inode);
-}
-
-static void logfs_init_inode(struct super_block *sb, struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- int i;
-
- li->li_flags = 0;
- li->li_height = 0;
- li->li_used_bytes = 0;
- li->li_block = NULL;
- i_uid_write(inode, 0);
- i_gid_write(inode, 0);
- inode->i_size = 0;
- inode->i_blocks = 0;
- inode->i_ctime = current_time(inode);
- inode->i_mtime = current_time(inode);
- li->li_refcount = 1;
- INIT_LIST_HEAD(&li->li_freeing_list);
-
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- li->li_data[i] = 0;
-
- return;
-}
-
-static struct inode *logfs_alloc_inode(struct super_block *sb)
-{
- struct logfs_inode *li;
-
- li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
- if (!li)
- return NULL;
- logfs_init_inode(sb, &li->vfs_inode);
- return &li->vfs_inode;
-}
-
-/*
- * In logfs inodes are written to an inode file. The inode file, like any
- * other file, is managed with a inode. The inode file's inode, aka master
- * inode, requires special handling in several respects. First, it cannot be
- * written to the inode file, so it is stored in the journal instead.
- *
- * Secondly, this inode cannot be written back and destroyed before all other
- * inodes have been written. The ordering is important. Linux' VFS is happily
- * unaware of the ordering constraint and would ordinarily destroy the master
- * inode at umount time while other inodes are still in use and dirty. Not
- * good.
- *
- * So logfs makes sure the master inode is not written until all other inodes
- * have been destroyed. Sadly, this method has another side-effect. The VFS
- * will notice one remaining inode and print a frightening warning message.
- * Worse, it is impossible to judge whether such a warning was caused by the
- * master inode or any other inodes have leaked as well.
- *
- * Our attempt of solving this is with logfs_new_meta_inode() below. Its
- * purpose is to create a new inode that will not trigger the warning if such
- * an inode is still in use. An ugly hack, no doubt. Suggections for
- * improvement are welcome.
- *
- * AV: that's what ->put_super() is for...
- */
-struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
-{
- struct inode *inode;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- inode->i_mode = S_IFREG;
- inode->i_ino = ino;
- inode->i_data.a_ops = &logfs_reg_aops;
- mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);
-
- return inode;
-}
-
-struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
-{
- struct inode *inode;
- int err;
-
- inode = logfs_new_meta_inode(sb, ino);
- if (IS_ERR(inode))
- return inode;
-
- err = logfs_read_inode(inode);
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- logfs_inode_setops(inode);
- return inode;
-}
-
-static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- int ret;
- long flags = WF_LOCK;
-
- /* Can only happen if creat() failed. Safe to skip. */
- if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
- return 0;
-
- ret = __logfs_write_inode(inode, NULL, flags);
- LOGFS_BUG_ON(ret, inode->i_sb);
- return ret;
-}
-
-/* called with inode->i_lock held */
-static int logfs_drop_inode(struct inode *inode)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- struct logfs_inode *li = logfs_inode(inode);
-
- spin_lock(&logfs_inode_lock);
- list_move(&li->li_freeing_list, &super->s_freeing_list);
- spin_unlock(&logfs_inode_lock);
- return generic_drop_inode(inode);
-}
-
-static void logfs_set_ino_generation(struct super_block *sb,
- struct inode *inode)
-{
- struct logfs_super *super = logfs_super(sb);
- u64 ino;
-
- mutex_lock(&super->s_journal_mutex);
- ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
- super->s_last_ino = ino;
- super->s_inos_till_wrap--;
- if (super->s_inos_till_wrap < 0) {
- super->s_last_ino = LOGFS_RESERVED_INOS;
- super->s_generation++;
- super->s_inos_till_wrap = INOS_PER_WRAP;
- }
- inode->i_ino = ino;
- inode->i_generation = super->s_generation;
- mutex_unlock(&super->s_journal_mutex);
-}
-
-struct inode *logfs_new_inode(struct inode *dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct inode *inode;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- logfs_init_inode(sb, inode);
-
- /* inherit parent flags */
- logfs_inode(inode)->li_flags |=
- logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
-
- inode->i_mode = mode;
- logfs_set_ino_generation(sb, inode);
-
- inode_init_owner(inode, dir, mode);
- logfs_inode_setops(inode);
- insert_inode_hash(inode);
-
- return inode;
-}
-
-static void logfs_init_once(void *_li)
-{
- struct logfs_inode *li = _li;
- int i;
-
- li->li_flags = 0;
- li->li_used_bytes = 0;
- li->li_refcount = 1;
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- li->li_data[i] = 0;
- inode_init_once(&li->vfs_inode);
-}
-
-static int logfs_sync_fs(struct super_block *sb, int wait)
-{
- logfs_get_wblocks(sb, NULL, WF_LOCK);
- logfs_write_anchor(sb);
- logfs_put_wblocks(sb, NULL, WF_LOCK);
- return 0;
-}
-
-static void logfs_put_super(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- /* kill the meta-inodes */
- iput(super->s_segfile_inode);
- iput(super->s_master_inode);
- iput(super->s_mapping_inode);
-}
-
-const struct super_operations logfs_super_operations = {
- .alloc_inode = logfs_alloc_inode,
- .destroy_inode = logfs_destroy_inode,
- .evict_inode = logfs_evict_inode,
- .drop_inode = logfs_drop_inode,
- .put_super = logfs_put_super,
- .write_inode = logfs_write_inode,
- .statfs = logfs_statfs,
- .sync_fs = logfs_sync_fs,
-};
-
-int logfs_init_inode_cache(void)
-{
- logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
- logfs_init_once);
- if (!logfs_inode_cache)
- return -ENOMEM;
- return 0;
-}
-
-void logfs_destroy_inode_cache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(logfs_inode_cache);
-}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
deleted file mode 100644
index 2a09b8d73989..000000000000
--- a/fs/logfs/journal.c
+++ /dev/null
@@ -1,894 +0,0 @@
-/*
- * fs/logfs/journal.c - journal handling code
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/slab.h>
-
-static void logfs_calc_free(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- u64 reserve, no_segs = super->s_no_segs;
- s64 free;
- int i;
-
- /* superblock segments */
- no_segs -= 2;
- super->s_no_journal_segs = 0;
- /* journal */
- journal_for_each(i)
- if (super->s_journal_seg[i]) {
- no_segs--;
- super->s_no_journal_segs++;
- }
-
- /* open segments plus one extra per level for GC */
- no_segs -= 2 * super->s_total_levels;
-
- free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
- free -= super->s_used_bytes;
- /* just a bit extra */
- free -= super->s_total_levels * 4096;
-
- /* Bad blocks are 'paid' for with speed reserve - the filesystem
- * simply gets slower as bad blocks accumulate. Until the bad blocks
- * exceed the speed reserve - then the filesystem gets smaller.
- */
- reserve = super->s_bad_segments + super->s_bad_seg_reserve;
- reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
- reserve = max(reserve, super->s_speed_reserve);
- free -= reserve;
- if (free < 0)
- free = 0;
-
- super->s_free_bytes = free;
-}
-
-static void reserve_sb_and_journal(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct btree_head32 *head = &super->s_reserved_segments;
- int i, err;
-
- err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
- GFP_KERNEL);
- BUG_ON(err);
-
- err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
- GFP_KERNEL);
- BUG_ON(err);
-
- journal_for_each(i) {
- if (!super->s_journal_seg[i])
- continue;
- err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
- GFP_KERNEL);
- BUG_ON(err);
- }
-}
-
-static void read_dynsb(struct super_block *sb,
- struct logfs_je_dynsb *dynsb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- super->s_gec = be64_to_cpu(dynsb->ds_gec);
- super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
- super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
- super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
- super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
- super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
- super->s_generation = be32_to_cpu(dynsb->ds_generation);
-}
-
-static void read_anchor(struct super_block *sb,
- struct logfs_je_anchor *da)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode = super->s_master_inode;
- struct logfs_inode *li = logfs_inode(inode);
- int i;
-
- super->s_last_ino = be64_to_cpu(da->da_last_ino);
- li->li_flags = 0;
- li->li_height = da->da_height;
- i_size_write(inode, be64_to_cpu(da->da_size));
- li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
-
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- li->li_data[i] = be64_to_cpu(da->da_data[i]);
-}
-
-static void read_erasecount(struct super_block *sb,
- struct logfs_je_journal_ec *ec)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- journal_for_each(i)
- super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
-}
-
-static int read_area(struct super_block *sb, struct logfs_je_area *a)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_area[a->gc_level];
- u64 ofs;
- u32 writemask = ~(super->s_writesize - 1);
-
- if (a->gc_level >= LOGFS_NO_AREAS)
- return -EIO;
- if (a->vim != VIM_DEFAULT)
- return -EIO; /* TODO: close area and continue */
-
- area->a_used_bytes = be32_to_cpu(a->used_bytes);
- area->a_written_bytes = area->a_used_bytes & writemask;
- area->a_segno = be32_to_cpu(a->segno);
- if (area->a_segno)
- area->a_is_open = 1;
-
- ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
- if (super->s_writesize > 1)
- return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
- else
- return logfs_buf_recover(area, ofs, NULL, 0);
-}
-
-static void *unpack(void *from, void *to)
-{
- struct logfs_journal_header *jh = from;
- void *data = from + sizeof(struct logfs_journal_header);
- int err;
- size_t inlen, outlen;
-
- inlen = be16_to_cpu(jh->h_len);
- outlen = be16_to_cpu(jh->h_datalen);
-
- if (jh->h_compr == COMPR_NONE)
- memcpy(to, data, inlen);
- else {
- err = logfs_uncompress(data, to, inlen, outlen);
- BUG_ON(err);
- }
- return to;
-}
-
-static int __read_je_header(struct super_block *sb, u64 ofs,
- struct logfs_journal_header *jh)
-{
- struct logfs_super *super = logfs_super(sb);
- size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
- + MAX_JOURNAL_HEADER;
- u16 type, len, datalen;
- int err;
-
- /* read header only */
- err = wbuf_read(sb, ofs, sizeof(*jh), jh);
- if (err)
- return err;
- type = be16_to_cpu(jh->h_type);
- len = be16_to_cpu(jh->h_len);
- datalen = be16_to_cpu(jh->h_datalen);
- if (len > sb->s_blocksize)
- return -EIO;
- if ((type < JE_FIRST) || (type > JE_LAST))
- return -EIO;
- if (datalen > bufsize)
- return -EIO;
- return 0;
-}
-
-static int __read_je_payload(struct super_block *sb, u64 ofs,
- struct logfs_journal_header *jh)
-{
- u16 len;
- int err;
-
- len = be16_to_cpu(jh->h_len);
- err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
- if (err)
- return err;
- if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
- /* Old code was confused. It forgot about the header length
- * and stopped calculating the crc 16 bytes before the end
- * of data - ick!
- * FIXME: Remove this hack once the old code is fixed.
- */
- if (jh->h_crc == logfs_crc32(jh, len, 4))
- WARN_ON_ONCE(1);
- else
- return -EIO;
- }
- return 0;
-}
-
-/*
- * jh needs to be large enough to hold the complete entry, not just the header
- */
-static int __read_je(struct super_block *sb, u64 ofs,
- struct logfs_journal_header *jh)
-{
- int err;
-
- err = __read_je_header(sb, ofs, jh);
- if (err)
- return err;
- return __read_je_payload(sb, ofs, jh);
-}
-
-static int read_je(struct super_block *sb, u64 ofs)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_journal_header *jh = super->s_compressed_je;
- void *scratch = super->s_je;
- u16 type, datalen;
- int err;
-
- err = __read_je(sb, ofs, jh);
- if (err)
- return err;
- type = be16_to_cpu(jh->h_type);
- datalen = be16_to_cpu(jh->h_datalen);
-
- switch (type) {
- case JE_DYNSB:
- read_dynsb(sb, unpack(jh, scratch));
- break;
- case JE_ANCHOR:
- read_anchor(sb, unpack(jh, scratch));
- break;
- case JE_ERASECOUNT:
- read_erasecount(sb, unpack(jh, scratch));
- break;
- case JE_AREA:
- err = read_area(sb, unpack(jh, scratch));
- break;
- case JE_OBJ_ALIAS:
- err = logfs_load_object_aliases(sb, unpack(jh, scratch),
- datalen);
- break;
- default:
- WARN_ON_ONCE(1);
- return -EIO;
- }
- return err;
-}
-
-static int logfs_read_segment(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_journal_header *jh = super->s_compressed_je;
- u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
- u32 h_ofs, last_ofs = 0;
- u16 len, datalen, last_len = 0;
- int i, err;
-
- /* search for most recent commit */
- for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
- ofs = seg_ofs + h_ofs;
- err = __read_je_header(sb, ofs, jh);
- if (err)
- continue;
- if (jh->h_type != cpu_to_be16(JE_COMMIT))
- continue;
- err = __read_je_payload(sb, ofs, jh);
- if (err)
- continue;
- len = be16_to_cpu(jh->h_len);
- datalen = be16_to_cpu(jh->h_datalen);
- if ((datalen > sizeof(super->s_je_array)) ||
- (datalen % sizeof(__be64)))
- continue;
- last_ofs = h_ofs;
- last_len = datalen;
- h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
- }
- /* read commit */
- if (last_ofs == 0)
- return -ENOENT;
- ofs = seg_ofs + last_ofs;
- log_journal("Read commit from %llx\n", ofs);
- err = __read_je(sb, ofs, jh);
- BUG_ON(err); /* We should have caught it in the scan loop already */
- if (err)
- return err;
- /* uncompress */
- unpack(jh, super->s_je_array);
- super->s_no_je = last_len / sizeof(__be64);
- /* iterate over array */
- for (i = 0; i < super->s_no_je; i++) {
- err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
- if (err)
- return err;
- }
- super->s_journal_area->a_segno = segno;
- return 0;
-}
-
-static u64 read_gec(struct super_block *sb, u32 segno)
-{
- struct logfs_segment_header sh;
- __be32 crc;
- int err;
-
- if (!segno)
- return 0;
- err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
- if (err)
- return 0;
- crc = logfs_crc32(&sh, sizeof(sh), 4);
- if (crc != sh.crc) {
- WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
- /* Most likely it was just erased */
- return 0;
- }
- return be64_to_cpu(sh.gec);
-}
-
-static int logfs_read_journal(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- u64 gec[LOGFS_JOURNAL_SEGS], max;
- u32 segno;
- int i, max_i;
-
- max = 0;
- max_i = -1;
- journal_for_each(i) {
- segno = super->s_journal_seg[i];
- gec[i] = read_gec(sb, super->s_journal_seg[i]);
- if (gec[i] > max) {
- max = gec[i];
- max_i = i;
- }
- }
- if (max_i == -1)
- return -EIO;
- /* FIXME: Try older segments in case of error */
- return logfs_read_segment(sb, super->s_journal_seg[max_i]);
-}
-
-/*
- * First search the current segment (outer loop), then pick the next segment
- * in the array, skipping any zero entries (inner loop).
- */
-static void journal_get_free_segment(struct logfs_area *area)
-{
- struct logfs_super *super = logfs_super(area->a_sb);
- int i;
-
- journal_for_each(i) {
- if (area->a_segno != super->s_journal_seg[i])
- continue;
-
- do {
- i++;
- if (i == LOGFS_JOURNAL_SEGS)
- i = 0;
- } while (!super->s_journal_seg[i]);
-
- area->a_segno = super->s_journal_seg[i];
- area->a_erase_count = ++(super->s_journal_ec[i]);
- log_journal("Journal now at %x (ec %x)\n", area->a_segno,
- area->a_erase_count);
- return;
- }
- BUG();
-}
-
-static void journal_get_erase_count(struct logfs_area *area)
-{
- /* erase count is stored globally and incremented in
- * journal_get_free_segment() - nothing to do here */
-}
-
-static int journal_erase_segment(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- union {
- struct logfs_segment_header sh;
- unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
- } u;
- u64 ofs;
- int err;
-
- err = logfs_erase_segment(sb, area->a_segno, 1);
- if (err)
- return err;
-
- memset(&u, 0, sizeof(u));
- u.sh.pad = 0;
- u.sh.type = SEG_JOURNAL;
- u.sh.level = 0;
- u.sh.segno = cpu_to_be32(area->a_segno);
- u.sh.ec = cpu_to_be32(area->a_erase_count);
- u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
- u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
-
- /* This causes a bug in segment.c. Not yet. */
- //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
-
- ofs = dev_ofs(sb, area->a_segno, 0);
- area->a_used_bytes = sizeof(u);
- logfs_buf_write(area, ofs, &u, sizeof(u));
- return 0;
-}
-
-static size_t __logfs_write_header(struct logfs_super *super,
- struct logfs_journal_header *jh, size_t len, size_t datalen,
- u16 type, u8 compr)
-{
- jh->h_len = cpu_to_be16(len);
- jh->h_type = cpu_to_be16(type);
- jh->h_datalen = cpu_to_be16(datalen);
- jh->h_compr = compr;
- jh->h_pad[0] = 'H';
- jh->h_pad[1] = 'E';
- jh->h_pad[2] = 'A';
- jh->h_pad[3] = 'D';
- jh->h_pad[4] = 'R';
- jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
- return ALIGN(len, 16) + sizeof(*jh);
-}
-
-static size_t logfs_write_header(struct logfs_super *super,
- struct logfs_journal_header *jh, size_t datalen, u16 type)
-{
- size_t len = datalen;
-
- return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
-}
-
-static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
-{
- return LOGFS_JOURNAL_SEGS * sizeof(__be32);
-}
-
-static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
- u16 *type, size_t *len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_je_journal_ec *ec = _ec;
- int i;
-
- journal_for_each(i)
- ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
- *type = JE_ERASECOUNT;
- *len = logfs_journal_erasecount_size(super);
- return ec;
-}
-
-static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
- size_t ignore2)
-{
- struct logfs_shadow *shadow = _shadow;
- struct super_block *sb = (void *)_sb;
- struct logfs_super *super = logfs_super(sb);
-
- /* consume new space */
- super->s_free_bytes -= shadow->new_len;
- super->s_used_bytes += shadow->new_len;
- super->s_dirty_used_bytes -= shadow->new_len;
-
- /* free up old space */
- super->s_free_bytes += shadow->old_len;
- super->s_used_bytes -= shadow->old_len;
- super->s_dirty_free_bytes -= shadow->old_len;
-
- logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
- logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
-
- log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
- shadow->ino, shadow->bix, shadow->gc_level,
- shadow->old_ofs, shadow->new_ofs,
- shadow->old_len, shadow->new_len);
- mempool_free(shadow, super->s_shadow_pool);
-}
-
-static void account_shadows(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode = super->s_master_inode;
- struct logfs_inode *li = logfs_inode(inode);
- struct shadow_tree *tree = &super->s_shadow_tree;
-
- btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
- btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
- btree_grim_visitor32(&tree->segment_map, 0, NULL);
- tree->no_shadowed_segments = 0;
-
- if (li->li_block) {
- /*
- * We never actually use the structure, when attached to the
- * master inode. But it is easier to always free it here than
- * to have checks in several places elsewhere when allocating
- * it.
- */
- li->li_block->ops->free_block(sb, li->li_block);
- }
- BUG_ON((s64)li->li_used_bytes < 0);
-}
-
-static void *__logfs_write_anchor(struct super_block *sb, void *_da,
- u16 *type, size_t *len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_je_anchor *da = _da;
- struct inode *inode = super->s_master_inode;
- struct logfs_inode *li = logfs_inode(inode);
- int i;
-
- da->da_height = li->li_height;
- da->da_last_ino = cpu_to_be64(super->s_last_ino);
- da->da_size = cpu_to_be64(i_size_read(inode));
- da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- da->da_data[i] = cpu_to_be64(li->li_data[i]);
- *type = JE_ANCHOR;
- *len = sizeof(*da);
- return da;
-}
-
-static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
- u16 *type, size_t *len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_je_dynsb *dynsb = _dynsb;
-
- dynsb->ds_gec = cpu_to_be64(super->s_gec);
- dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
- dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
- dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
- dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
- dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
- dynsb->ds_generation = cpu_to_be32(super->s_generation);
- *type = JE_DYNSB;
- *len = sizeof(*dynsb);
- return dynsb;
-}
-
-static void write_wbuf(struct super_block *sb, struct logfs_area *area,
- void *wbuf)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- u64 ofs;
- pgoff_t index;
- int page_ofs;
- struct page *page;
-
- ofs = dev_ofs(sb, area->a_segno,
- area->a_used_bytes & ~(super->s_writesize - 1));
- index = ofs >> PAGE_SHIFT;
- page_ofs = ofs & (PAGE_SIZE - 1);
-
- page = find_or_create_page(mapping, index, GFP_NOFS);
- BUG_ON(!page);
- memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
- unlock_page(page);
-}
-
-static void *logfs_write_area(struct super_block *sb, void *_a,
- u16 *type, size_t *len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_area[super->s_sum_index];
- struct logfs_je_area *a = _a;
-
- a->vim = VIM_DEFAULT;
- a->gc_level = super->s_sum_index;
- a->used_bytes = cpu_to_be32(area->a_used_bytes);
- a->segno = cpu_to_be32(area->a_segno);
- if (super->s_writesize > 1)
- write_wbuf(sb, area, a + 1);
-
- *type = JE_AREA;
- *len = sizeof(*a) + super->s_writesize;
- return a;
-}
-
-static void *logfs_write_commit(struct super_block *sb, void *h,
- u16 *type, size_t *len)
-{
- struct logfs_super *super = logfs_super(sb);
-
- *type = JE_COMMIT;
- *len = super->s_no_je * sizeof(__be64);
- return super->s_je_array;
-}
-
-static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
- size_t len)
-{
- struct logfs_super *super = logfs_super(sb);
- void *header = super->s_compressed_je;
- void *data = header + sizeof(struct logfs_journal_header);
- ssize_t compr_len, pad_len;
- u8 compr = COMPR_ZLIB;
-
- if (len == 0)
- return logfs_write_header(super, header, 0, type);
-
- compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
- if (compr_len < 0 || type == JE_ANCHOR) {
- memcpy(data, buf, len);
- compr_len = len;
- compr = COMPR_NONE;
- }
-
- pad_len = ALIGN(compr_len, 16);
- memset(data + compr_len, 0, pad_len - compr_len);
-
- return __logfs_write_header(super, header, compr_len, len, type, compr);
-}
-
-static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
- int must_pad)
-{
- u32 writesize = logfs_super(area->a_sb)->s_writesize;
- s32 ofs;
- int ret;
-
- ret = logfs_open_area(area, *bytes);
- if (ret)
- return -EAGAIN;
-
- ofs = area->a_used_bytes;
- area->a_used_bytes += *bytes;
-
- if (must_pad) {
- area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
- *bytes = area->a_used_bytes - ofs;
- }
-
- return dev_ofs(area->a_sb, area->a_segno, ofs);
-}
-
-static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
- size_t buf_len)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_journal_area;
- struct logfs_journal_header *jh = super->s_compressed_je;
- size_t len;
- int must_pad = 0;
- s64 ofs;
-
- len = __logfs_write_je(sb, buf, type, buf_len);
- if (jh->h_type == cpu_to_be16(JE_COMMIT))
- must_pad = 1;
-
- ofs = logfs_get_free_bytes(area, &len, must_pad);
- if (ofs < 0)
- return ofs;
- logfs_buf_write(area, ofs, super->s_compressed_je, len);
- BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
- super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
- return 0;
-}
-
-static int logfs_write_je(struct super_block *sb,
- void* (*write)(struct super_block *sb, void *scratch,
- u16 *type, size_t *len))
-{
- void *buf;
- size_t len;
- u16 type;
-
- buf = write(sb, logfs_super(sb)->s_je, &type, &len);
- return logfs_write_je_buf(sb, buf, type, len);
-}
-
-int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
- level_t level, int child_no, __be64 val)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_obj_alias *oa = super->s_je;
- int err = 0, fill = super->s_je_fill;
-
- log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
- fill, ino, bix, level, child_no, be64_to_cpu(val));
- oa[fill].ino = cpu_to_be64(ino);
- oa[fill].bix = cpu_to_be64(bix);
- oa[fill].val = val;
- oa[fill].level = (__force u8)level;
- oa[fill].child_no = cpu_to_be16(child_no);
- fill++;
- if (fill >= sb->s_blocksize / sizeof(*oa)) {
- err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
- fill = 0;
- }
-
- super->s_je_fill = fill;
- return err;
-}
-
-static int logfs_write_obj_aliases(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int err;
-
- log_journal("logfs_write_obj_aliases: %d aliases to write\n",
- super->s_no_object_aliases);
- super->s_je_fill = 0;
- err = logfs_write_obj_aliases_pagecache(sb);
- if (err)
- return err;
-
- if (super->s_je_fill)
- err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
- super->s_je_fill
- * sizeof(struct logfs_obj_alias));
- return err;
-}
-
-/*
- * Write all journal entries. The goto logic ensures that all journal entries
- * are written whenever a new segment is used. It is ugly and potentially a
- * bit wasteful, but robustness is more important. With this we can *always*
- * erase all journal segments except the one containing the most recent commit.
- *
- * Does nothing unless LOGFS_SB_FLAG_DIRTY is set; the flag is cleared before
- * s_journal_mutex is taken. Entries are written in a fixed order: one area
- * summary per open area, the object aliases, the erase counts, the anchor,
- * the dynamic superblock and finally the commit. If any of these writes
- * fails, the sequence restarts at "again" with s_no_je reset to zero.
- *
- * NOTE(review): the function only returns after a complete pass succeeded,
- * so a persistent write error would loop forever - confirm that the journal
- * entry writers can only fail transiently (e.g. on a segment switch).
- */
-void logfs_write_anchor(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_journal_area;
- int i, err;
-
- if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
- return;
- super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
-
- BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
- mutex_lock(&super->s_journal_mutex);
-
- /* Do this first or suffer corruption */
- logfs_sync_segments(sb);
- account_shadows(sb);
-
-again:
- super->s_no_je = 0;
- for_each_area(i) {
- if (!super->s_area[i]->a_is_open)
- continue;
- super->s_sum_index = i;
- err = logfs_write_je(sb, logfs_write_area);
- if (err)
- goto again;
- }
- err = logfs_write_obj_aliases(sb);
- if (err)
- goto again;
- err = logfs_write_je(sb, logfs_write_erasecount);
- if (err)
- goto again;
- err = logfs_write_je(sb, __logfs_write_anchor);
- if (err)
- goto again;
- err = logfs_write_je(sb, logfs_write_dynsb);
- if (err)
- goto again;
- /*
- * Order is imperative. First we sync all writes, including the
- * non-committed journal writes. Then we write the final commit and
- * sync the current journal segment.
- * There is a theoretical bug here. Syncing the journal segment will
- * write a number of journal entries and the final commit. All these
- * are written in a single operation. If the device layer writes the
- * data back-to-front, the commit will precede the other journal
- * entries, leaving a race window.
- * Two fixes are possible. Preferred is to fix the device layer to
- * ensure writes happen front-to-back. Alternatively we can insert
- * another logfs_sync_area() super->s_devops->sync() combo before
- * writing the commit.
- */
- /*
- * On another subject, super->s_devops->sync is usually not necessary.
- * Unless called from sys_sync or friends, a barrier would suffice.
- */
- super->s_devops->sync(sb);
- err = logfs_write_je(sb, logfs_write_commit);
- if (err)
- goto again;
- log_journal("Write commit to %llx\n",
- be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
- logfs_sync_area(area);
- BUG_ON(area->a_used_bytes != area->a_written_bytes);
- super->s_devops->sync(sb);
-
- mutex_unlock(&super->s_journal_mutex);
- return;
-}
-
-/*
- * Wear-level the journal: give back every journal segment currently in use,
- * pick s_no_journal_segs replacements from the reserve list, erase them,
- * point the journal area at the first new segment and finally rewrite the
- * journal and the superblocks.
- *
- * Every failure on this path is fatal (BUG_ON).
- */
-void do_logfs_journal_wl_pass(struct super_block *sb)
-{
-	struct logfs_super *super = logfs_super(sb);
-	struct logfs_area *area = super->s_journal_area;
-	struct btree_head32 *head = &super->s_reserved_segments;
-	u32 segno, ec;
-	int i, err;
-
-	log_journal("Journal requires wear-leveling.\n");
-
-	/* Drop old segments */
-	journal_for_each(i) {
-		u32 old_seg = super->s_journal_seg[i];
-
-		if (!old_seg)
-			continue;
-		btree_remove32(head, old_seg);
-		logfs_set_segment_unreserved(sb, old_seg,
-				super->s_journal_ec[i]);
-		super->s_journal_seg[i] = 0;
-		super->s_journal_ec[i] = 0;
-	}
-
-	/* Get new segments */
-	for (i = 0; i < super->s_no_journal_segs; i++) {
-		segno = get_best_cand(sb, &super->s_reserve_list, &ec);
-		super->s_journal_seg[i] = segno;
-		super->s_journal_ec[i] = ec;
-		logfs_set_segment_reserved(sb, segno);
-
-		err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
-		BUG_ON(err); /* mempool should prevent this */
-
-		err = logfs_erase_segment(sb, segno, 1);
-		BUG_ON(err); /* FIXME: remount-ro would be nicer */
-	}
-
-	/* Manually move journal_area */
-	freeseg(sb, area->a_segno);
-	area->a_segno = super->s_journal_seg[0];
-	area->a_is_open = 0;
-	area->a_used_bytes = 0;
-
-	/* Write journal */
-	logfs_write_anchor(sb);
-
-	/* Write superblocks */
-	err = logfs_write_sb(sb);
-	BUG_ON(err);
-}
-
-static const struct logfs_area_ops journal_area_ops = { /* installed as a_ops of the journal area by logfs_init_journal() */
- .get_free_segment = journal_get_free_segment,
- .get_erase_count = journal_get_erase_count,
- .erase_segment = journal_erase_segment,
-};
-
-/*
- * Set up the in-memory journal state: the journal mutex, the reserved-segment
- * btree, the two journal entry buffers and the master inode.  Afterwards the
- * on-medium journal is read, superblock and journal segments are reserved
- * and the free-space figures are calculated.
- *
- * Returns 0 on success, -ENOMEM when a buffer allocation fails, the error
- * encoded in the master inode pointer when its allocation fails, or -EIO
- * when logfs_read_journal() reports any error.
- *
- * Nothing is released here on failure.  NOTE(review): presumably the caller
- * tears down via logfs_cleanup_journal() - confirm in super.c.
- */
-int logfs_init_journal(struct super_block *sb)
-{
-	struct logfs_super *super = logfs_super(sb);
-	size_t bufsize;
-
-	/* Room for the larger of block size and write size, plus a header. */
-	bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
-		+ MAX_JOURNAL_HEADER;
-
-	mutex_init(&super->s_journal_mutex);
-	btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
-
-	super->s_je = kzalloc(bufsize, GFP_KERNEL);
-	if (!super->s_je)
-		return -ENOMEM;
-
-	super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
-	if (!super->s_compressed_je)
-		return -ENOMEM;
-
-	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
-	if (IS_ERR(super->s_master_inode))
-		return PTR_ERR(super->s_master_inode);
-
-	if (logfs_read_journal(sb))
-		return -EIO;
-
-	reserve_sb_and_journal(sb);
-	logfs_calc_free(sb);
-
-	super->s_journal_area->a_ops = &journal_area_ops;
-	return 0;
-}
-
-/*
- * Tear down the journal state: run btree_grim_visitor32() with a NULL
- * callback over the reserved-segment btree and free both journal entry
- * buffers.
- */
-void logfs_cleanup_journal(struct super_block *sb)
-{
-	struct logfs_super *super = logfs_super(sb);
-	struct btree_head32 *reserved = &super->s_reserved_segments;
-
-	btree_grim_visitor32(reserved, 0, NULL);
-
-	kfree(super->s_compressed_je);
-	kfree(super->s_je);
-}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
deleted file mode 100644
index 27d040e35faa..000000000000
--- a/fs/logfs/logfs.h
+++ /dev/null
@@ -1,735 +0,0 @@
-/*
- * fs/logfs/logfs.h
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- * Private header for logfs.
- */
-#ifndef FS_LOGFS_LOGFS_H
-#define FS_LOGFS_LOGFS_H
-
-#undef __CHECK_ENDIAN__
-#define __CHECK_ENDIAN__
-
-#include <linux/btree.h>
-#include <linux/crc32.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/mempool.h>
-#include <linux/pagemap.h>
-#include <linux/mtd/mtd.h>
-#include "logfs_abi.h"
-
-#define LOGFS_DEBUG_SUPER (0x0001)
-#define LOGFS_DEBUG_SEGMENT (0x0002)
-#define LOGFS_DEBUG_JOURNAL (0x0004)
-#define LOGFS_DEBUG_DIR (0x0008)
-#define LOGFS_DEBUG_FILE (0x0010)
-#define LOGFS_DEBUG_INODE (0x0020)
-#define LOGFS_DEBUG_READWRITE (0x0040)
-#define LOGFS_DEBUG_GC (0x0080)
-#define LOGFS_DEBUG_GC_NOISY (0x0100)
-#define LOGFS_DEBUG_ALIASES (0x0200)
-#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
-#define LOGFS_DEBUG_ALL (0xffffffff)
-
-#define LOGFS_DEBUG (0x01)
-/*
- * To enable specific log messages, simply define LOGFS_DEBUG to match any
- * or all of the above.
- *
- * NOTE(review): LOGFS_DEBUG is defined unconditionally as 0x01 (the
- * LOGFS_DEBUG_SUPER bit) right above, so the #ifndef fallback below never
- * takes effect and log_super() messages are always enabled. Confirm whether
- * the unconditional define is a debugging leftover.
- */
-#ifndef LOGFS_DEBUG
-#define LOGFS_DEBUG (0)
-#endif
-
-/*
- * Debug logging helpers.  log_cond() prints at KERN_DEBUG when its condition
- * is true; each log_<topic>() wrapper tests the matching LOGFS_DEBUG_<TOPIC>
- * bit in LOGFS_DEBUG.
- */
-#define log_cond(cond, fmt, ...) do {				\
-	if (cond)						\
-		printk(KERN_DEBUG fmt, ##__VA_ARGS__);		\
-} while (0)
-
-#define log_super(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##__VA_ARGS__)
-#define log_segment(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##__VA_ARGS__)
-#define log_journal(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##__VA_ARGS__)
-#define log_dir(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##__VA_ARGS__)
-#define log_file(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##__VA_ARGS__)
-#define log_inode(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##__VA_ARGS__)
-#define log_readwrite(fmt, ...)				\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##__VA_ARGS__)
-#define log_gc(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##__VA_ARGS__)
-#define log_gc_noisy(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##__VA_ARGS__)
-#define log_aliases(fmt, ...)					\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##__VA_ARGS__)
-#define log_blockmove(fmt, ...)				\
-	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##__VA_ARGS__)
-
-#define PG_pre_locked PG_owner_priv_1 /* logfs uses the owner-private page flag bit as its "pre-locked" marker */
-#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
-#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
-#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
-
-/* FIXME: This should really be somewhere in the 64bit area. */
-#define LOGFS_LINK_MAX (1<<30)
-
-/* Bits of logfs_super.s_flags; LOGFS_SB_FLAG_RO marks a read-only filesystem */
-#define LOGFS_SB_FLAG_RO 0x0001
-#define LOGFS_SB_FLAG_DIRTY 0x0002
-#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
-#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
-
-/* Write Control Flags */
-#define WF_LOCK 0x01 /* take write lock */
-#define WF_WRITE 0x02 /* write block */
-#define WF_DELETE 0x04 /* delete old block */
-
-typedef u8 __bitwise level_t; /* block level as used within one file */
-typedef u8 __bitwise gc_level_t; /* level_t, raised by LOGFS_MAX_LEVELS for the ifile; see expand_level() */
-
-#define LEVEL(level) ((__force level_t)(level))
-#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
-
-#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
- (__force level_t)((__force u8)(level) - 1) ) /* level minus one; the discarded comparison presumably only lets sparse type-check the argument */
-
-/**
- * struct logfs_area - area management information
- *
- * @a_sb: the superblock this area belongs to
- * @a_is_open: 1 if the area is currently open, else 0
- * @a_segno: segment number of area
- * @a_written_bytes: number of bytes already written back; expected to equal
- *	@a_used_bytes right after logfs_sync_area()
- * @a_used_bytes: number of used bytes
- * @a_ops: area operations (either journal or ostore)
- * @a_erase_count: erase count
- * @a_level: GC level
- */
-struct logfs_area { /* a segment open for writing */
- struct super_block *a_sb;
- int a_is_open;
- u32 a_segno;
- u32 a_written_bytes;
- u32 a_used_bytes;
- const struct logfs_area_ops *a_ops;
- u32 a_erase_count;
- gc_level_t a_level;
-};
-
-/**
- * struct logfs_area_ops - area operations
- *
- * @get_free_segment: fill area->ofs with the offset of a free segment
- * @get_erase_count: fill area->erase_count (needs area->ofs)
- * @erase_segment: erase and setup segment
- *
- * NOTE(review): struct logfs_area has no "ofs" or "erase_count" members;
- * presumably a_segno and a_erase_count are meant - confirm against the
- * journal and ostore implementations.
- */
-struct logfs_area_ops {
- void (*get_free_segment)(struct logfs_area *area);
- void (*get_erase_count)(struct logfs_area *area);
- int (*erase_segment)(struct logfs_area *area);
-};
-
-struct logfs_super; /* forward */
-/**
- * struct logfs_device_ops - device access operations
- *
- * @find_first_sb: return the page of the first superblock copy and store a
- *	value in *ofs (presumably its device offset - confirm)
- * @find_last_sb: same as @find_first_sb, for the last superblock copy
- * @write_sb: write one superblock page
- * @readpage: read one page (mm page)
- * @writeseg: write one segment. may be a partial segment
- * @erase: erase part of the device
- * @can_write_buf: decide whether wbuf can be written to ofs
- * @sync: presumably waits for outstanding device writes; called around the
- *	journal commit
- * @put_device: presumably releases the underlying device - confirm
- */
-struct logfs_device_ops {
- struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
- struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
- int (*write_sb)(struct super_block *sb, struct page *page);
- int (*readpage)(void *_sb, struct page *page);
- void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
- int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
- int ensure_write);
- int (*can_write_buf)(struct super_block *sb, u64 ofs);
- void (*sync)(struct super_block *sb);
- void (*put_device)(struct logfs_super *s);
-};
-
-/**
- * struct candidate_list - list of similar candidates
- *
- * @rb_tree: root of the rb-tree holding the candidates
- * @count: presumably the current number of candidates - confirm in gc.c
- * @maxcount: presumably the maximum number of candidates kept - confirm
- * @sort_by_ec: presumably selects ordering by erase count - confirm
- */
-struct candidate_list {
- struct rb_root rb_tree;
- int count;
- int maxcount;
- int sort_by_ec;
-};
-
-/**
- * struct gc_candidate - "candidate" segment to be garbage collected next
- *
- * @rb_node: rb-tree linkage
- * @list: list (either free or low)
- * @segno: segment number
- * @valid: number of valid bytes
- * @erase_count: erase count of segment
- * @dist: distance from tree root
- *
- * Candidates can be on two lists. The free list contains electees rather
- * than candidates - segments that no longer contain any valid data. The
- * low list contains candidates to be picked for GC. It should be kept
- * short. It is not required to always pick a perfect candidate. In the
- * worst case GC will have to move more data than absolutely necessary.
- */
-struct gc_candidate {
- struct rb_node rb_node;
- struct candidate_list *list;
- u32 segno;
- u32 valid;
- u32 erase_count;
- u8 dist;
-};
-
-/**
- * struct logfs_journal_entry - temporary structure used during journal scan
- *
- * @used: presumably non-zero once the slot holds an entry - confirm
- * @version: normalized version
- * @len: length
- * @datalen: undocumented in the original; presumably the payload length as
- *	opposed to @len - confirm in journal.c
- * @offset: offset
- */
-struct logfs_journal_entry {
- int used;
- s16 version;
- u16 len;
- u16 datalen;
- u64 offset;
-};
-
-enum transaction_state { /* values of logfs_transaction.state; numbering starts at 1 */
- CREATE_1 = 1,
- CREATE_2,
- UNLINK_1,
- UNLINK_2,
- CROSS_RENAME_1,
- CROSS_RENAME_2,
- TARGET_RENAME_1,
- TARGET_RENAME_2,
- TARGET_RENAME_3
-};
-
-/**
- * struct logfs_transaction - essential fields to support atomic dirops
- *
- * @state: one of enum transaction_state
- * @ino: target inode
- * @dir: inode of directory containing dentry
- * @pos: pos of dentry in directory
- */
-struct logfs_transaction {
- enum transaction_state state;
- u64 ino;
- u64 dir;
- u64 pos;
-};
-
-/**
- * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
- * @old_ofs: offset of old block on medium
- * @new_ofs: offset of new block on medium
- * @ino: inode number
- * @bix: block index
- * @old_len: size of old block, including header
- * @new_len: size of new block, including header
- * @gc_level: block level (as a gc_level_t)
- */
-struct logfs_shadow {
- u64 old_ofs;
- u64 new_ofs;
- u64 ino;
- u64 bix;
- int old_len;
- int new_len;
- gc_level_t gc_level;
-};
-
-/**
- * struct shadow_tree
- * @new: shadows where old_ofs==0, indexed by new_ofs
- * @old: shadows where old_ofs!=0, indexed by old_ofs
- * @segment_map: bitfield of segments containing shadows
- * @no_shadowed_segments: number of segments containing shadows
- */
-struct shadow_tree {
- struct btree_head64 new;
- struct btree_head64 old;
- struct btree_head32 segment_map;
- int no_shadowed_segments;
-};
-
-struct object_alias_item { /* one aliased child: (child_no, val) pair on a list; presumably linked into logfs_block.item_list - confirm */
- struct list_head list;
- __be64 val;
- int child_no;
-};
-
-/**
- * struct logfs_block - contains any block state
- * @type: indirect block or inode (NOTE(review): the struct has no member of
- *	this name; BLOCK_INDIRECT/BLOCK_INODE below presumably encode it)
- * @full: number of fully populated children
- * @partial: number of partially populated children
- * @ino: inode number the block belongs to
- * @bix: block index
- * @level: block level
- * @alias_map: bitmap with LOGFS_BLOCK_FACTOR bits, presumably one per child
- * @ops: block operations, see struct logfs_block_ops
- *
- * Most blocks are directly represented by page cache pages. But when a block
- * becomes dirty, is part of a transaction, contains aliases or is otherwise
- * special, a struct logfs_block is allocated to track the additional state.
- * Inodes are very similar to indirect blocks, so they can also get one of
- * these structures added when appropriate.
- */
-#define BLOCK_INDIRECT 1 /* Indirect block */
-#define BLOCK_INODE 2 /* Inode */
-struct logfs_block_ops;
-struct logfs_block {
- struct list_head alias_list;
- struct list_head item_list;
- struct super_block *sb;
- u64 ino;
- u64 bix;
- level_t level;
- struct page *page;
- struct inode *inode;
- struct logfs_transaction *ta;
- unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
- const struct logfs_block_ops *ops;
- int full;
- int partial;
- int reserved_bytes;
-};
-
-typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
- level_t level, int child_no, __be64 val); /* callback writing a single alias; write_alias_journal() has this signature */
-struct logfs_block_ops { /* per-block-type operations referenced by logfs_block.ops */
- void (*write_block)(struct logfs_block *block);
- void (*free_block)(struct super_block *sb, struct logfs_block*block);
- int (*write_alias)(struct super_block *sb,
- struct logfs_block *block,
- write_alias_t *write_one_alias);
-};
-
-#define MAX_JOURNAL_ENTRIES 256 /* number of slots in logfs_super.s_je_array */
-
-struct logfs_super {
- struct mtd_info *s_mtd; /* underlying device */
- struct block_device *s_bdev; /* underlying device */
- const struct logfs_device_ops *s_devops;/* device access */
- struct inode *s_master_inode; /* inode file */
- struct inode *s_segfile_inode; /* segment file */
- struct inode *s_mapping_inode; /* device mapping */
- atomic_t s_pending_writes; /* outstanding bios */
- long s_flags;
- mempool_t *s_btree_pool; /* for btree nodes */
- mempool_t *s_alias_pool; /* aliases in segment.c */
- u64 s_feature_incompat;
- u64 s_feature_ro_compat;
- u64 s_feature_compat;
- u64 s_feature_flags;
- u64 s_sb_ofs[2];
- struct page *s_erase_page; /* for dev_bdev.c */
- /* alias.c fields */
- struct btree_head32 s_segment_alias; /* remapped segments */
- int s_no_object_aliases;
- struct list_head s_object_alias; /* remapped objects */
- struct btree_head128 s_object_alias_tree; /* remapped objects */
- struct mutex s_object_alias_mutex;
- /* dir.c fields */
- struct mutex s_dirop_mutex; /* for creat/unlink/rename */
- u64 s_victim_ino; /* used for atomic dir-ops */
- u64 s_rename_dir; /* source directory ino */
- u64 s_rename_pos; /* position of source dd */
- /* gc.c fields */
- long s_segsize; /* size of a segment */
- int s_segshift; /* log2 of segment size */
- long s_segmask; /* (1 << s_segshift) - 1 */
- long s_no_segs; /* segments on device */
- long s_no_journal_segs; /* segments used for journal */
- long s_no_blocks; /* blocks per segment */
- long s_writesize; /* minimum write size */
- int s_writeshift; /* log2 of write size */
- u64 s_size; /* filesystem size */
- struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
- u64 s_gec; /* global erase count */
- u64 s_wl_gec_ostore; /* time of last wl event */
- u64 s_wl_gec_journal; /* time of last wl event */
- u64 s_sweeper; /* current sweeper pos */
- u8 s_ifile_levels; /* max level of ifile */
- u8 s_iblock_levels; /* max level of regular files */
- u8 s_data_levels; /* # of segments to leaf block*/
- u8 s_total_levels; /* sum of above three */
- struct btree_head32 s_cand_tree; /* all candidates */
- struct candidate_list s_free_list; /* 100% free segments */
- struct candidate_list s_reserve_list; /* Bad segment reserve */
- struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
- struct candidate_list s_ec_list; /* wear level candidates */
- struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
- /* inode.c fields */
- u64 s_last_ino; /* highest ino used */
- long s_inos_till_wrap;
- u32 s_generation; /* i_generation for new files */
- struct list_head s_freeing_list; /* inodes being freed */
- /* journal.c fields */
- struct mutex s_journal_mutex;
- void *s_je; /* journal entry to compress */
- void *s_compressed_je; /* block to write to journal */
- u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
- u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
- u64 s_last_version;
- struct logfs_area *s_journal_area; /* open journal segment */
- __be64 s_je_array[MAX_JOURNAL_ENTRIES];
- int s_no_je;
-
- int s_sum_index; /* for the 12 summaries */
- struct shadow_tree s_shadow_tree;
- int s_je_fill; /* index of current je */
- /* readwrite.c fields */
- struct mutex s_write_mutex;
- int s_lock_count;
- mempool_t *s_block_pool; /* struct logfs_block pool */
- mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
- struct list_head s_writeback_list; /* writeback pages */
- /*
- * Space accounting:
- * - s_used_bytes specifies space used to store valid data objects.
- * - s_dirty_used_bytes is space used to store non-committed data
- * objects. Those objects have already been written themselves,
- * but they don't become valid until all indirect blocks up to the
- * journal have been written as well.
- * - s_dirty_free_bytes is space used to store the old copy of a
- * replaced object, as long as the replacement is non-committed.
- * In other words, it is the amount of space freed when all dirty
- * blocks are written back.
- * - s_free_bytes is the amount of free space available for any
- * purpose.
- * - s_root_reserve is the amount of free space available only to
- * the root user. Non-privileged users can no longer write once
- * this watermark has been reached.
- * - s_speed_reserve is space which remains unused to speed up
- * garbage collection performance.
- * - s_dirty_pages is the space reserved for currently dirty pages.
- * It is a pessimistic estimate, so some/most will get freed on
- * page writeback.
- *
- * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
- */
- u64 s_free_bytes;
- u64 s_used_bytes;
- u64 s_dirty_free_bytes;
- u64 s_dirty_used_bytes;
- u64 s_root_reserve;
- u64 s_speed_reserve;
- u64 s_dirty_pages;
- /* Bad block handling:
- * - s_bad_seg_reserve is a number of segments usually kept
- * free. When encountering bad blocks, the affected segment's data
- * is _temporarily_ moved to a reserved segment.
- * - s_bad_segments is the number of known bad segments.
- */
- u32 s_bad_seg_reserve;
- u32 s_bad_segments;
-};
-
-/**
- * struct logfs_inode - in-memory inode
- *
- * @vfs_inode: struct inode
- * @li_data: data pointers
- * @li_used_bytes: number of used bytes
- * @li_freeing_list: used to track inodes currently being freed
- * @li_block: struct logfs_block attached to this inode (presumably NULL
- *	when no extra block state is tracked - confirm)
- * @li_flags: inode flags
- * @li_height: undocumented in the original; presumably the height of the
- *	inode's indirect block tree - confirm
- * @li_refcount: number of internal (GC-induced) references
- */
-struct logfs_inode {
- struct inode vfs_inode;
- u64 li_data[LOGFS_EMBEDDED_FIELDS];
- u64 li_used_bytes;
- struct list_head li_freeing_list;
- struct logfs_block *li_block;
- u32 li_flags;
- u8 li_height;
- int li_refcount;
-};
-
-#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) /* __i is a caller-declared index variable */
-#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) /* ascending over all area indices */
-#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) /* descending; __i must be signed or the loop never ends */
-
-/* compr.c */
-int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
-int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
-int __init logfs_compr_init(void);
-void logfs_compr_exit(void);
-
-/* dev_bdev.c; without CONFIG_BLOCK the inline stub below always fails with -ENODEV */
-#ifdef CONFIG_BLOCK
-int logfs_get_sb_bdev(struct logfs_super *s,
- struct file_system_type *type,
- const char *devname);
-#else
-static inline int logfs_get_sb_bdev(struct logfs_super *s,
- struct file_system_type *type,
- const char *devname)
-{
- return -ENODEV;
-}
-#endif
-
-/* dev_mtd.c; without CONFIG_MTD (built-in or module) the inline stub below always fails with -ENODEV */
-#if IS_ENABLED(CONFIG_MTD)
-int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
-#else
-static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
-{
- return -ENODEV;
-}
-#endif
-
-/* dir.c */
-extern const struct inode_operations logfs_dir_iops;
-extern const struct file_operations logfs_dir_fops;
-int logfs_replay_journal(struct super_block *sb);
-
-/* file.c */
-extern const struct inode_operations logfs_reg_iops;
-extern const struct file_operations logfs_reg_fops;
-extern const struct address_space_operations logfs_reg_aops;
-int logfs_readpage(struct file *file, struct page *page);
-long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
-
-/* gc.c */
-u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
-void logfs_gc_pass(struct super_block *sb);
-int logfs_check_areas(struct super_block *sb);
-int logfs_init_gc(struct super_block *sb);
-void logfs_cleanup_gc(struct super_block *sb);
-
-/* inode.c */
-extern const struct super_operations logfs_super_operations;
-struct inode *logfs_iget(struct super_block *sb, ino_t ino);
-struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
-void logfs_safe_iput(struct inode *inode, int cookie);
-struct inode *logfs_new_inode(struct inode *dir, umode_t mode);
-struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
-struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
-int logfs_init_inode_cache(void);
-void logfs_destroy_inode_cache(void);
-void logfs_set_blocks(struct inode *inode, u64 no);
-/* these logically belong into inode.c but actually reside in readwrite.c */
-int logfs_read_inode(struct inode *inode);
-int __logfs_write_inode(struct inode *inode, struct page *, long flags);
-void logfs_evict_inode(struct inode *inode);
-
-/* journal.c */
-void logfs_write_anchor(struct super_block *sb);
-int logfs_init_journal(struct super_block *sb);
-void logfs_cleanup_journal(struct super_block *sb);
-int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
- level_t level, int child_no, __be64 val);
-void do_logfs_journal_wl_pass(struct super_block *sb);
-
-/* readwrite.c */
-pgoff_t logfs_pack_index(u64 bix, level_t level);
-void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
-int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
- loff_t bix, long flags, struct shadow_tree *shadow_tree);
-int logfs_readpage_nolock(struct page *page);
-int logfs_write_buf(struct inode *inode, struct page *page, long flags);
-int logfs_delete(struct inode *inode, pgoff_t index,
- struct shadow_tree *shadow_tree);
-int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
- gc_level_t gc_level, long flags);
-int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
- gc_level_t gc_level);
-int logfs_truncate(struct inode *inode, u64 size);
-u64 logfs_seek_hole(struct inode *inode, u64 bix);
-u64 logfs_seek_data(struct inode *inode, u64 bix);
-int logfs_open_segfile(struct super_block *sb);
-int logfs_init_rw(struct super_block *sb);
-void logfs_cleanup_rw(struct super_block *sb);
-void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
-void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
-void logfs_write_block(struct logfs_block *block, long flags);
-int logfs_write_obj_aliases_pagecache(struct super_block *sb);
-void logfs_get_segment_entry(struct super_block *sb, u32 segno,
- struct logfs_segment_entry *se);
-void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
-void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
- gc_level_t gc_level);
-void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
-void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
-struct logfs_block *__alloc_block(struct super_block *sb,
- u64 ino, u64 bix, level_t level);
-void __free_block(struct super_block *sb, struct logfs_block *block);
-void btree_write_block(struct logfs_block *block);
-void initialize_block_counters(struct page *page, struct logfs_block *block,
- __be64 *array, int page_is_empty);
-int logfs_exist_block(struct inode *inode, u64 bix);
-int get_page_reserve(struct inode *inode, struct page *page);
-void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
-void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
-extern const struct logfs_block_ops indirect_block_ops;
-
-/* segment.c */
-int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
-int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
-int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
- level_t level);
-int logfs_segment_write(struct inode *inode, struct page *page,
- struct logfs_shadow *shadow);
-int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
-int logfs_load_object_aliases(struct super_block *sb,
- struct logfs_obj_alias *oa, int count);
-void move_page_to_btree(struct page *page);
-int logfs_init_mapping(struct super_block *sb);
-void logfs_sync_area(struct logfs_area *area);
-void logfs_sync_segments(struct super_block *sb);
-void freeseg(struct super_block *sb, u32 segno);
-void free_areas(struct super_block *sb);
-
-/* area handling */
-int logfs_init_areas(struct super_block *sb);
-void logfs_cleanup_areas(struct super_block *sb);
-int logfs_open_area(struct logfs_area *area, size_t bytes);
-int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
- int use_filler);
-
-/* Write @len bytes from @buf at @ofs: __logfs_buf_write() with use_filler off. */
-static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
-		void *buf, size_t len)
-{
-	const int use_filler = 0;
-
-	return __logfs_buf_write(area, ofs, buf, len, use_filler);
-}
-
-/* Recovery variant of logfs_buf_write(): __logfs_buf_write() with use_filler on. */
-static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
-		void *buf, size_t len)
-{
-	const int use_filler = 1;
-
-	return __logfs_buf_write(area, ofs, buf, len, use_filler);
-}
-
-/* super.c */
-struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
-void emergency_read_end(struct page *page);
-void logfs_crash_dump(struct super_block *sb);
-int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
-int logfs_check_ds(struct logfs_disk_super *ds);
-int logfs_write_sb(struct super_block *sb);
-
-/* Return the logfs private data stored in the VFS superblock's s_fs_info. */
-static inline struct logfs_super *logfs_super(struct super_block *sb)
-{
-	struct logfs_super *super = sb->s_fs_info;
-
-	return super;
-}
-
-/* Map a VFS inode to the struct logfs_inode that embeds it as vfs_inode. */
-static inline struct logfs_inode *logfs_inode(struct inode *inode)
-{
-	struct logfs_inode *li;
-
-	li = container_of(inode, struct logfs_inode, vfs_inode);
-	return li;
-}
-
-/* Flag the filesystem read-only by setting LOGFS_SB_FLAG_RO in s_flags. */
-static inline void logfs_set_ro(struct super_block *sb)
-{
-	struct logfs_super *super = logfs_super(sb);
-
-	super->s_flags |= LOGFS_SB_FLAG_RO;
-}
-
-/*
- * LOGFS_BUG() dumps logfs state, flags the filesystem read-only and then
- * hits BUG().  The argument is evaluated exactly once.
- * LOGFS_BUG_ON() does the same when @condition is true.
- */
-#define LOGFS_BUG(sb) do {						\
-	struct super_block *__logfs_sb = sb;				\
-									\
-	logfs_crash_dump(__logfs_sb);					\
-	logfs_super(__logfs_sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
-	BUG();								\
-} while (0)
-
-#define LOGFS_BUG_ON(condition, sb) do {				\
-	if (unlikely(condition))					\
-		LOGFS_BUG((sb));					\
-} while (0)
-
-/*
- * Big-endian crc32 (seed ~0) over @data, leaving out the first @skip bytes;
- * @len is the length of the whole buffer including the skipped part.
- */
-static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
-{
-	void *start = data + skip;
-	size_t span = len - skip;
-
-	return cpu_to_be32(crc32(~0, start, span));
-}
-
-/* Return bits 12..15 of i_mode as a 4-bit value. */
-static inline u8 logfs_type(struct inode *inode)
-{
-	return (inode->i_mode >> 12) & 0xf;
-}
-
-/* Convert a byte position into a block index for this superblock's block size. */
-static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
-{
-	pgoff_t index = pos >> sb->s_blocksize_bits;
-
-	return index;
-}
-
-/* Combine a segment number and an offset inside that segment into a device offset. */
-static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
-{
-	u64 seg_start = (u64)segno << logfs_super(sb)->s_segshift;
-
-	return seg_start + ofs;
-}
-
-/* Segment number containing device offset @ofs. */
-static inline u32 seg_no(struct super_block *sb, u64 ofs)
-{
-	int shift = logfs_super(sb)->s_segshift;
-
-	return ofs >> shift;
-}
-
-/* Offset of device offset @ofs within its segment. */
-static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
-{
-	long mask = logfs_super(sb)->s_segmask;
-
-	return ofs & mask;
-}
-
-/* Round device offset @ofs down to the start of its segment. */
-static inline u64 seg_align(struct super_block *sb, u64 ofs)
-{
-	long mask = logfs_super(sb)->s_segmask;
-
-	return ofs & ~mask;
-}
-
-/* Interpret page->private as a pointer to the page's struct logfs_block. */
-static inline struct logfs_block *logfs_block(struct page *page)
-{
-	unsigned long priv = page->private;
-
-	return (void *)priv;
-}
-
-/*
- * Convert a gc level to a plain level: values at or above LOGFS_MAX_LEVELS
- * are lowered by LOGFS_MAX_LEVELS, smaller values pass through unchanged.
- */
-static inline level_t shrink_level(gc_level_t __level)
-{
-	u8 level = (__force u8)__level;
-
-	if (level < LOGFS_MAX_LEVELS)
-		return (__force level_t)level;
-
-	level -= LOGFS_MAX_LEVELS;
-	return (__force level_t)level;
-}
-
-/*
- * Convert a plain level to a gc level.  Blocks of the master inode
- * (LOGFS_INO_MASTER, the ifile) are raised by LOGFS_MAX_LEVELS because the
- * ifile has separate areas; all other inodes keep their level.
- */
-static inline gc_level_t expand_level(u64 ino, level_t __level)
-{
-	u8 level = (__force u8)__level;
-
-	if (ino != LOGFS_INO_MASTER)
-		return (__force gc_level_t)level;
-
-	level += LOGFS_MAX_LEVELS;
-	return (__force gc_level_t)level;
-}
-
-/*
- * Number of block-index bits covered by one block at @level: the shrunken
- * level times (s_blocksize_bits - 3).
- */
-static inline int logfs_block_shift(struct super_block *sb, level_t level)
-{
-	level_t plain = shrink_level((__force gc_level_t)level);
-	int bits_per_level = sb->s_blocksize_bits - 3;
-
-	return (__force int)plain * bits_per_level;
-}
-
-/* Mask that clears the low logfs_block_shift() bits of a block index. */
-static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
-{
-	int shift = logfs_block_shift(sb, level);
-
-	return ~0ull << shift;
-}
-
-/* Look up the area for @gc_level in the superblock's s_area[] array. */
-static inline struct logfs_area *get_area(struct super_block *sb,
-		gc_level_t gc_level)
-{
-	struct logfs_super *super = logfs_super(sb);
-	u8 idx = (__force u8)gc_level;
-
-	return super->s_area[idx];
-}
-
-/* mempool_destroy() wrapper that skips the call for a NULL pool. */
-static inline void logfs_mempool_destroy(mempool_t *pool)
-{
-	if (!pool)
-		return;
-
-	mempool_destroy(pool);
-}
-
-#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
deleted file mode 100644
index ae960519c54a..000000000000
--- a/fs/logfs/logfs_abi.h
+++ /dev/null
@@ -1,629 +0,0 @@
-/*
- * fs/logfs/logfs_abi.h
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- * Public header for logfs.
- */
-#ifndef FS_LOGFS_LOGFS_ABI_H
-#define FS_LOGFS_LOGFS_ABI_H
-
-/* For out-of-kernel compiles */
-#ifndef BUILD_BUG_ON
-#define BUILD_BUG_ON(condition) /**/
-#endif
-
-#define SIZE_CHECK(type, size) \
-static inline void check_##type(void) \
-{ \
- BUILD_BUG_ON(sizeof(struct type) != (size)); \
-}
-
-/*
- * Throughout the logfs code, we're constantly dealing with blocks at
- * various positions or offsets. To remove confusion, we stricly
- * distinguish between a "position" - the logical position within a
- * file and an "offset" - the physical location within the device.
- *
- * Any usage of the term offset for a logical location or position for
- * a physical one is a bug and should get fixed.
- */
-
-/*
- * Block are allocated in one of several segments depending on their
- * level. The following levels are used:
- * 0 - regular data block
- * 1 - i1 indirect blocks
- * 2 - i2 indirect blocks
- * 3 - i3 indirect blocks
- * 4 - i4 indirect blocks
- * 5 - i5 indirect blocks
- * 6 - ifile data blocks
- * 7 - ifile i1 indirect blocks
- * 8 - ifile i2 indirect blocks
- * 9 - ifile i3 indirect blocks
- * 10 - ifile i4 indirect blocks
- * 11 - ifile i5 indirect blocks
- * Potential levels to be used in the future:
- * 12 - gc recycled blocks, long-lived data
- * 13 - replacement blocks, short-lived data
- *
- * Levels 1-11 are necessary for robust gc operations and help separate
- * short-lived metadata from longer-lived file data. In the future,
- * file data should get separated into several segments based on simple
- * heuristics. Old data recycled during gc operation is expected to be
- * long-lived. New data is of uncertain life expectancy. New data
- * used to replace older blocks in existing files is expected to be
- * short-lived.
- */
-
-
-/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
-#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
-#define LOGFS_MAGIC_U32 0xc97e8168u
-
-/*
- * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
- * Sooner or later that should become configurable and the macros replaced
- * by something superblock-dependent. Pointers in indirect blocks are and
- * will remain 64bit.
- *
- * LOGFS_BLOCKSIZE - self-explaining
- * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
- * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
- */
-#define LOGFS_BLOCKSIZE (4096ull)
-#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
-#define LOGFS_BLOCK_BITS (9)
-
-/*
- * Number of blocks at various levels of indirection. There are 16 direct
- * block pointers plus a single indirect pointer.
- */
-#define I0_BLOCKS (16)
-#define I1_BLOCKS LOGFS_BLOCK_FACTOR
-#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
-#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
-#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
-#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
-
-#define INDIRECT_INDEX I0_BLOCKS
-#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
-
-/*
- * Sizes at which files require another level of indirection. Files smaller
- * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
- * similar like ext2 fast symlinks.
- *
- * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
- * direct pointers, else through the 1x indirect pointer and so forth.
- */
-#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
-#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
-
-/*
- * Each indirect block pointer must have this flag set, if all block pointers
- * behind it are set, i.e. there is no hole hidden in the shadow of this
- * indirect block pointer.
- */
-#define LOGFS_FULLY_POPULATED (1ULL << 63)
-#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
-
-/*
- * LogFS needs to separate data into levels. Each level is defined as the
- * maximal possible distance from the master inode (inode of the inode file).
- * Data blocks reside on level 0, 1x indirect block on level 1, etc.
- * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
- * This effort is necessary to guarantee garbage collection to always make
- * progress.
- *
- * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
- * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
- * the maximal number of levels for one file.
- * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
- * effectively stacked on top of each other.
- */
-#define LOGFS_MAX_INDIRECT (5)
-#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
-#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
-
-/* Maximum size of filenames */
-#define LOGFS_MAX_NAMELEN (255)
-
-/* Number of segments in the primary journal. */
-#define LOGFS_JOURNAL_SEGS (16)
-
-/* Maximum number of free/erased/etc. segments in journal entries */
-#define MAX_CACHED_SEGS (64)
-
-
-/*
- * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
- * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
- * its header,
- * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
- * its segment header and the padded space at the end when no further objects
- * fit.
- */
-#define LOGFS_OBJECT_HEADERSIZE (0x1c)
-#define LOGFS_SEGMENT_HEADERSIZE (0x18)
-#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
-#define LOGFS_SEGMENT_RESERVE \
- (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
-
-/*
- * Segment types:
- * SEG_SUPER - Data or indirect block
- * SEG_JOURNAL - Inode
- * SEG_OSTORE - Dentry
- */
-enum {
- SEG_SUPER = 0x01,
- SEG_JOURNAL = 0x02,
- SEG_OSTORE = 0x03,
-};
-
-/**
- * struct logfs_segment_header - per-segment header in the ostore
- *
- * @crc: crc32 of header (there is no data)
- * @pad: unused, must be 0
- * @type: segment type, see above
- * @level: GC level for all objects in this segment
- * @segno: segment number
- * @ec: erase count for this segment
- * @gec: global erase count at time of writing
- */
-struct logfs_segment_header {
- __be32 crc;
- __be16 pad;
- __u8 type;
- __u8 level;
- __be32 segno;
- __be32 ec;
- __be64 gec;
-};
-
-SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
-
-#define LOGFS_FEATURES_INCOMPAT (0ull)
-#define LOGFS_FEATURES_RO_COMPAT (0ull)
-#define LOGFS_FEATURES_COMPAT (0ull)
-
-/**
- * struct logfs_disk_super - on-medium superblock
- *
- * @ds_magic: magic number, must equal LOGFS_MAGIC
- * @ds_crc: crc32 of structure starting with the next field
- * @ds_ifile_levels: maximum number of levels for ifile
- * @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of separate levels for data
- * @pad0: reserved, must be 0
- * @ds_feature_incompat: incompatible filesystem features
- * @ds_feature_ro_compat: read-only compatible filesystem features
- * @ds_feature_compat: compatible filesystem features
- * @ds_flags: flags
- * @ds_segment_shift: log2 of segment size
- * @ds_block_shift: log2 of block size
- * @ds_write_shift: log2 of write size
- * @pad1: reserved, must be 0
- * @ds_journal_seg: segments used by primary journal
- * @ds_root_reserve: bytes reserved for the superuser
- * @ds_speed_reserve: bytes reserved to speed up GC
- * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
- * @pad2: reserved, must be 0
- * @pad3: reserved, must be 0
- *
- * Contains only read-only fields. Read-write fields like the amount of used
- * space is tracked in the dynamic superblock, which is stored in the journal.
- */
-struct logfs_disk_super {
- struct logfs_segment_header ds_sh;
- __be64 ds_magic;
-
- __be32 ds_crc;
- __u8 ds_ifile_levels;
- __u8 ds_iblock_levels;
- __u8 ds_data_levels;
- __u8 ds_segment_shift;
- __u8 ds_block_shift;
- __u8 ds_write_shift;
- __u8 pad0[6];
-
- __be64 ds_filesystem_size;
- __be32 ds_segment_size;
- __be32 ds_bad_seg_reserve;
-
- __be64 ds_feature_incompat;
- __be64 ds_feature_ro_compat;
-
- __be64 ds_feature_compat;
- __be64 ds_feature_flags;
-
- __be64 ds_root_reserve;
- __be64 ds_speed_reserve;
-
- __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
-
- __be64 ds_super_ofs[2];
- __be64 pad3[8];
-};
-
-SIZE_CHECK(logfs_disk_super, 256);
-
-/*
- * Object types:
- * OBJ_BLOCK - Data or indirect block
- * OBJ_INODE - Inode
- * OBJ_DENTRY - Dentry
- */
-enum {
- OBJ_BLOCK = 0x04,
- OBJ_INODE = 0x05,
- OBJ_DENTRY = 0x06,
-};
-
-/**
- * struct logfs_object_header - per-object header in the ostore
- *
- * @crc: crc32 of header, excluding data_crc
- * @len: length of data
- * @type: object type, see above
- * @compr: compression type
- * @ino: inode number
- * @bix: block index
- * @data_crc: crc32 of payload
- */
-struct logfs_object_header {
- __be32 crc;
- __be16 len;
- __u8 type;
- __u8 compr;
- __be64 ino;
- __be64 bix;
- __be32 data_crc;
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
-
-/*
- * Reserved inode numbers:
- * LOGFS_INO_MASTER - master inode (for inode file)
- * LOGFS_INO_ROOT - root directory
- * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
- */
-enum {
- LOGFS_INO_MAPPING = 0x00,
- LOGFS_INO_MASTER = 0x01,
- LOGFS_INO_ROOT = 0x02,
- LOGFS_INO_SEGFILE = 0x03,
- LOGFS_RESERVED_INOS = 0x10,
-};
-
-/*
- * Inode flags. High bits should never be written to the medium. They are
- * reserved for in-memory usage.
- * Low bits should either remain in sync with the corresponding FS_*_FL or
- * reuse slots that obviously don't make sense for logfs.
- *
- * LOGFS_IF_DIRTY Inode must be written back
- * LOGFS_IF_ZOMBIE Inode has been deleted
- * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
- */
-#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
-#define LOGFS_IF_DIRTY 0x20000000
-#define LOGFS_IF_ZOMBIE 0x40000000
-#define LOGFS_IF_STILLBORN 0x80000000
-
-/* Flags available to chattr */
-#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
-#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
-/* Flags inherited from parent directory on file/directory creation */
-#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
-
-/**
- * struct logfs_disk_inode - on-medium inode
- *
- * @di_mode: file mode
- * @di_pad: reserved, must be 0
- * @di_flags: inode flags, see above
- * @di_uid: user id
- * @di_gid: group id
- * @di_ctime: change time
- * @di_mtime: modify time
- * @di_refcount: reference count (aka nlink or link count)
- * @di_generation: inode generation, for nfs
- * @di_used_bytes: number of bytes used
- * @di_size: file size
- * @di_data: data pointers
- */
-struct logfs_disk_inode {
- __be16 di_mode;
- __u8 di_height;
- __u8 di_pad;
- __be32 di_flags;
- __be32 di_uid;
- __be32 di_gid;
-
- __be64 di_ctime;
- __be64 di_mtime;
-
- __be64 di_atime;
- __be32 di_refcount;
- __be32 di_generation;
-
- __be64 di_used_bytes;
- __be64 di_size;
-
- __be64 di_data[LOGFS_EMBEDDED_FIELDS];
-};
-
-SIZE_CHECK(logfs_disk_inode, 200);
-
-#define INODE_POINTER_OFS \
- (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
-#define INODE_USED_OFS \
- (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
-#define INODE_SIZE_OFS \
- (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
-#define INODE_HEIGHT_OFS (0)
-
-/**
- * struct logfs_disk_dentry - on-medium dentry structure
- *
- * @ino: inode number
- * @namelen: length of file name
- * @type: file type, identical to bits 12..15 of mode
- * @name: file name
- */
-/* FIXME: add 6 bytes of padding to remove the __packed */
-struct logfs_disk_dentry {
- __be64 ino;
- __be16 namelen;
- __u8 type;
- __u8 name[LOGFS_MAX_NAMELEN];
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_disk_dentry, 266);
-
-#define RESERVED 0xffffffff
-#define BADSEG 0xffffffff
-/**
- * struct logfs_segment_entry - segment file entry
- *
- * @ec_level: erase count and level
- * @valid: number of valid bytes
- *
- * Segment file contains one entry for every segment. ec_level contains the
- * erasecount in the upper 28 bits and the level in the lower 4 bits. An
- * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
- * of valid bytes or RESERVED (-1 again) if the segment is used for either the
- * superblock or the journal, or when the segment is bad.
- */
-struct logfs_segment_entry {
- __be32 ec_level;
- __be32 valid;
-};
-
-SIZE_CHECK(logfs_segment_entry, 8);
-
-/**
- * struct logfs_journal_header - header for journal entries (JEs)
- *
- * @h_crc: crc32 of journal entry
- * @h_len: length of compressed journal entry,
- * not including header
- * @h_datalen: length of uncompressed data
- * @h_type: JE type
- * @h_compr: compression type
- * @h_pad: reserved
- */
-struct logfs_journal_header {
- __be32 h_crc;
- __be16 h_len;
- __be16 h_datalen;
- __be16 h_type;
- __u8 h_compr;
- __u8 h_pad[5];
-};
-
-SIZE_CHECK(logfs_journal_header, 16);
-
-/*
- * Life expectency of data.
- * VIM_DEFAULT - default vim
- * VIM_SEGFILE - for segment file only - very short-living
- * VIM_GC - GC'd data - likely long-living
- */
-enum logfs_vim {
- VIM_DEFAULT = 0,
- VIM_SEGFILE = 1,
-};
-
-/**
- * struct logfs_je_area - wbuf header
- *
- * @segno: segment number of area
- * @used_bytes: number of bytes already used
- * @gc_level: GC level
- * @vim: life expectancy of data
- *
- * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to separate long-living from
- * short-living data. If an area with unknown vim is encountered, it can
- * simply be closed.
- * The write buffer immediately follow this header.
- */
-struct logfs_je_area {
- __be32 segno;
- __be32 used_bytes;
- __u8 gc_level;
- __u8 vim;
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_je_area, 10);
-
-#define MAX_JOURNAL_HEADER \
- (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
-
-/**
- * struct logfs_je_dynsb - dynamic superblock
- *
- * @ds_gec: global erase count
- * @ds_sweeper: current position of GC "sweeper"
- * @ds_rename_dir: source directory ino (see dir.c documentation)
- * @ds_rename_pos: position of source dd (see dir.c documentation)
- * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
- * @ds_victim_ino: parent inode of victim (see dir.c)
- * @ds_used_bytes: number of used bytes
- */
-struct logfs_je_dynsb {
- __be64 ds_gec;
- __be64 ds_sweeper;
-
- __be64 ds_rename_dir;
- __be64 ds_rename_pos;
-
- __be64 ds_victim_ino;
- __be64 ds_victim_parent; /* XXX */
-
- __be64 ds_used_bytes;
- __be32 ds_generation;
- __be32 pad;
-};
-
-SIZE_CHECK(logfs_je_dynsb, 64);
-
-/**
- * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
- *
- * @da_size: size of inode file
- * @da_last_ino: last created inode
- * @da_used_bytes: number of bytes used
- * @da_data: data pointers
- */
-struct logfs_je_anchor {
- __be64 da_size;
- __be64 da_last_ino;
-
- __be64 da_used_bytes;
- u8 da_height;
- u8 pad[7];
-
- __be64 da_data[LOGFS_EMBEDDED_FIELDS];
-};
-
-SIZE_CHECK(logfs_je_anchor, 168);
-
-/**
- * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
- *
- * @so_segment: segments used for 2nd journal
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_spillout {
- __be64 so_segment[0];
-};
-
-SIZE_CHECK(logfs_je_spillout, 0);
-
-/**
- * struct logfs_je_journal_ec - erase counts for all journal segments
- *
- * @ec: erase count
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_journal_ec {
- __be32 ec[0];
-};
-
-SIZE_CHECK(logfs_je_journal_ec, 0);
-
-/**
- * struct logfs_je_free_segments - list of free segmetns with erase count
- */
-struct logfs_je_free_segments {
- __be32 segno;
- __be32 ec;
-};
-
-SIZE_CHECK(logfs_je_free_segments, 8);
-
-/**
- * struct logfs_seg_alias - list of segment aliases
- */
-struct logfs_seg_alias {
- __be32 old_segno;
- __be32 new_segno;
-};
-
-SIZE_CHECK(logfs_seg_alias, 8);
-
-/**
- * struct logfs_obj_alias - list of object aliases
- */
-struct logfs_obj_alias {
- __be64 ino;
- __be64 bix;
- __be64 val;
- u8 level;
- u8 pad[5];
- __be16 child_no;
-};
-
-SIZE_CHECK(logfs_obj_alias, 32);
-
-/**
- * Compression types.
- *
- * COMPR_NONE - uncompressed
- * COMPR_ZLIB - compressed with zlib
- */
-enum {
- COMPR_NONE = 0,
- COMPR_ZLIB = 1,
-};
-
-/*
- * Journal entries come in groups of 16. First group contains unique
- * entries, next groups contain one entry per level
- *
- * JE_FIRST - smallest possible journal entry number
- *
- * JEG_BASE - base group, containing unique entries
- * JE_COMMIT - commit entry, validates all previous entries
- * JE_DYNSB - dynamic superblock, anything that ought to be in the
- * superblock but cannot because it is read-write data
- * JE_ANCHOR - anchor aka master inode aka inode file's inode
- * JE_ERASECOUNT erasecounts for all journal segments
- * JE_SPILLOUT - unused
- * JE_SEG_ALIAS - aliases segments
- * JE_AREA - area description
- *
- * JE_LAST - largest possible journal entry number
- */
-enum {
- JE_FIRST = 0x01,
-
- JEG_BASE = 0x00,
- JE_COMMIT = 0x02,
- JE_DYNSB = 0x03,
- JE_ANCHOR = 0x04,
- JE_ERASECOUNT = 0x05,
- JE_SPILLOUT = 0x06,
- JE_OBJ_ALIAS = 0x0d,
- JE_AREA = 0x0e,
-
- JE_LAST = 0x0e,
-};
-
-#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
deleted file mode 100644
index bf19bf4a243f..000000000000
--- a/fs/logfs/readwrite.c
+++ /dev/null
@@ -1,2298 +0,0 @@
-/*
- * fs/logfs/readwrite.c
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- *
- * Actually contains five sets of very similar functions:
- * read read blocks from a file
- * seek_hole find next hole
- * seek_data find next data block
- * valid check whether a block still belongs to a file
- * write write blocks to a file
- * delete delete a block (for directories and ifile)
- * rewrite move existing blocks of a file to a new location (gc helper)
- * truncate truncate a file
- */
-#include "logfs.h"
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static u64 adjust_bix(u64 bix, level_t level)
-{
- switch (level) {
- case 0:
- return bix;
- case LEVEL(1):
- return max_t(u64, bix, I0_BLOCKS);
- case LEVEL(2):
- return max_t(u64, bix, I1_BLOCKS);
- case LEVEL(3):
- return max_t(u64, bix, I2_BLOCKS);
- case LEVEL(4):
- return max_t(u64, bix, I3_BLOCKS);
- case LEVEL(5):
- return max_t(u64, bix, I4_BLOCKS);
- default:
- WARN_ON(1);
- return bix;
- }
-}
-
-static inline u64 maxbix(u8 height)
-{
- return 1ULL << (LOGFS_BLOCK_BITS * height);
-}
-
-/**
- * The inode address space is cut in two halves. Lower half belongs to data
- * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
- * set, the actual block index (bix) and level can be derived from the page
- * index.
- *
- * The lowest three bits of the block index are set to 0 after packing and
- * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
- * anyway this is harmless.
- */
-#define ARCH_SHIFT (BITS_PER_LONG - 32)
-#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
-#define LEVEL_SHIFT (28 + ARCH_SHIFT)
-static inline pgoff_t first_indirect_block(void)
-{
- return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
-}
-
-pgoff_t logfs_pack_index(u64 bix, level_t level)
-{
- pgoff_t index;
-
- BUG_ON(bix >= INDIRECT_BIT);
- if (level == 0)
- return bix;
-
- index = INDIRECT_BIT;
- index |= (__force long)level << LEVEL_SHIFT;
- index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
- return index;
-}
-
-void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
-{
- u8 __level;
-
- if (!(index & INDIRECT_BIT)) {
- *bix = index;
- *level = 0;
- return;
- }
-
- __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
- *level = LEVEL(__level);
- *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
- *bix = adjust_bix(*bix, *level);
- return;
-}
-#undef ARCH_SHIFT
-#undef INDIRECT_BIT
-#undef LEVEL_SHIFT
-
-/*
- * Time is stored as nanoseconds since the epoch.
- */
-static struct timespec be64_to_timespec(__be64 betime)
-{
- return ns_to_timespec(be64_to_cpu(betime));
-}
-
-static __be64 timespec_to_be64(struct timespec tsp)
-{
- return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
-}
-
-static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- int i;
-
- inode->i_mode = be16_to_cpu(di->di_mode);
- li->li_height = di->di_height;
- li->li_flags = be32_to_cpu(di->di_flags);
- i_uid_write(inode, be32_to_cpu(di->di_uid));
- i_gid_write(inode, be32_to_cpu(di->di_gid));
- inode->i_size = be64_to_cpu(di->di_size);
- logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
- inode->i_atime = be64_to_timespec(di->di_atime);
- inode->i_ctime = be64_to_timespec(di->di_ctime);
- inode->i_mtime = be64_to_timespec(di->di_mtime);
- set_nlink(inode, be32_to_cpu(di->di_refcount));
- inode->i_generation = be32_to_cpu(di->di_generation);
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFSOCK: /* fall through */
- case S_IFBLK: /* fall through */
- case S_IFCHR: /* fall through */
- case S_IFIFO:
- inode->i_rdev = be64_to_cpu(di->di_data[0]);
- break;
- case S_IFDIR: /* fall through */
- case S_IFREG: /* fall through */
- case S_IFLNK:
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- li->li_data[i] = be64_to_cpu(di->di_data[i]);
- break;
- default:
- BUG();
- }
-}
-
-static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
-{
- struct logfs_inode *li = logfs_inode(inode);
- int i;
-
- di->di_mode = cpu_to_be16(inode->i_mode);
- di->di_height = li->li_height;
- di->di_pad = 0;
- di->di_flags = cpu_to_be32(li->li_flags);
- di->di_uid = cpu_to_be32(i_uid_read(inode));
- di->di_gid = cpu_to_be32(i_gid_read(inode));
- di->di_size = cpu_to_be64(i_size_read(inode));
- di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
- di->di_atime = timespec_to_be64(inode->i_atime);
- di->di_ctime = timespec_to_be64(inode->i_ctime);
- di->di_mtime = timespec_to_be64(inode->i_mtime);
- di->di_refcount = cpu_to_be32(inode->i_nlink);
- di->di_generation = cpu_to_be32(inode->i_generation);
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFSOCK: /* fall through */
- case S_IFBLK: /* fall through */
- case S_IFCHR: /* fall through */
- case S_IFIFO:
- di->di_data[0] = cpu_to_be64(inode->i_rdev);
- break;
- case S_IFDIR: /* fall through */
- case S_IFREG: /* fall through */
- case S_IFLNK:
- for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
- di->di_data[i] = cpu_to_be64(li->li_data[i]);
- break;
- default:
- BUG();
- }
-}
-
-static void __logfs_set_blocks(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_inode *li = logfs_inode(inode);
-
- inode->i_blocks = ULONG_MAX;
- if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
- inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
-}
-
-void logfs_set_blocks(struct inode *inode, u64 bytes)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- li->li_used_bytes = bytes;
- __logfs_set_blocks(inode);
-}
-
-static void prelock_page(struct super_block *sb, struct page *page, int lock)
-{
- struct logfs_super *super = logfs_super(sb);
-
- BUG_ON(!PageLocked(page));
- if (lock) {
- BUG_ON(PagePreLocked(page));
- SetPagePreLocked(page);
- } else {
- /* We are in GC path. */
- if (PagePreLocked(page))
- super->s_lock_count++;
- else
- SetPagePreLocked(page);
- }
-}
-
-static void preunlock_page(struct super_block *sb, struct page *page, int lock)
-{
- struct logfs_super *super = logfs_super(sb);
-
- BUG_ON(!PageLocked(page));
- if (lock)
- ClearPagePreLocked(page);
- else {
- /* We are in GC path. */
- BUG_ON(!PagePreLocked(page));
- if (super->s_lock_count)
- super->s_lock_count--;
- else
- ClearPagePreLocked(page);
- }
-}
-
-/*
- * Logfs is prone to an AB-BA deadlock where one task tries to acquire
- * s_write_mutex with a locked page and GC tries to get that page while holding
- * s_write_mutex.
- * To solve this issue logfs will ignore the page lock iff the page in question
- * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
- * in addition to PG_locked.
- */
-void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
-{
- struct logfs_super *super = logfs_super(sb);
-
- if (page)
- prelock_page(sb, page, lock);
-
- if (lock) {
- mutex_lock(&super->s_write_mutex);
- logfs_gc_pass(sb);
- /* FIXME: We also have to check for shadowed space
- * and mempool fill grade */
- }
-}
-
-void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
-{
- struct logfs_super *super = logfs_super(sb);
-
- if (page)
- preunlock_page(sb, page, lock);
- /* Order matters - we must clear PG_pre_locked before releasing
- * s_write_mutex or we could race against another task. */
- if (lock)
- mutex_unlock(&super->s_write_mutex);
-}
-
-static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
- level_t level)
-{
- return find_or_create_page(inode->i_mapping,
- logfs_pack_index(bix, level), GFP_NOFS);
-}
-
-static void logfs_put_read_page(struct page *page)
-{
- unlock_page(page);
- put_page(page);
-}
-
-static void logfs_lock_write_page(struct page *page)
-{
- int loop = 0;
-
- while (unlikely(!trylock_page(page))) {
- if (loop++ > 0x1000) {
- /* Has been observed once so far... */
- printk(KERN_ERR "stack at %p\n", &loop);
- BUG();
- }
- if (PagePreLocked(page)) {
- /* Holder of page lock is waiting for us, it
- * is safe to use this page. */
- break;
- }
- /* Some other process has this page locked and has
- * nothing to do with us. Wait for it to finish.
- */
- schedule();
- }
- BUG_ON(!PageLocked(page));
-}
-
-static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
- level_t level)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index = logfs_pack_index(bix, level);
- struct page *page;
- int err;
-
-repeat:
- page = find_get_page(mapping, index);
- if (!page) {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page)
- return NULL;
- err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
- if (unlikely(err)) {
- put_page(page);
- if (err == -EEXIST)
- goto repeat;
- return NULL;
- }
- } else logfs_lock_write_page(page);
- BUG_ON(!PageLocked(page));
- return page;
-}
-
-static void logfs_unlock_write_page(struct page *page)
-{
- if (!PagePreLocked(page))
- unlock_page(page);
-}
-
-static void logfs_put_write_page(struct page *page)
-{
- logfs_unlock_write_page(page);
- put_page(page);
-}
-
-static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
- int rw)
-{
- if (rw == READ)
- return logfs_get_read_page(inode, bix, level);
- else
- return logfs_get_write_page(inode, bix, level);
-}
-
-static void logfs_put_page(struct page *page, int rw)
-{
- if (rw == READ)
- logfs_put_read_page(page);
- else
- logfs_put_write_page(page);
-}
-
-static unsigned long __get_bits(u64 val, int skip, int no)
-{
- u64 ret = val;
-
- ret >>= skip * no;
- ret <<= 64 - no;
- ret >>= 64 - no;
- return ret;
-}
-
-static unsigned long get_bits(u64 val, level_t skip)
-{
- return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
-}
-
-static inline void init_shadow_tree(struct super_block *sb,
- struct shadow_tree *tree)
-{
- struct logfs_super *super = logfs_super(sb);
-
- btree_init_mempool64(&tree->new, super->s_btree_pool);
- btree_init_mempool64(&tree->old, super->s_btree_pool);
-}
-
-static void indirect_write_block(struct logfs_block *block)
-{
- struct page *page;
- struct inode *inode;
- int ret;
-
- page = block->page;
- inode = page->mapping->host;
- logfs_lock_write_page(page);
- ret = logfs_write_buf(inode, page, 0);
- logfs_unlock_write_page(page);
- /*
- * This needs some rework. Unless you want your filesystem to run
- * completely synchronously (you don't), the filesystem will always
- * report writes as 'successful' before the actual work has been
- * done. The actual work gets done here and this is where any errors
- * will show up. And there isn't much we can do about it, really.
- *
- * Some attempts to fix the errors (move from bad blocks, retry io,...)
- * have already been done, so anything left should be either a broken
- * device or a bug somewhere in logfs itself. Being relatively new,
- * the odds currently favor a bug, so for now the line below isn't
- * entirely tasteles.
- */
- BUG_ON(ret);
-}
-
-static void inode_write_block(struct logfs_block *block)
-{
- struct inode *inode;
- int ret;
-
- inode = block->inode;
- if (inode->i_ino == LOGFS_INO_MASTER)
- logfs_write_anchor(inode->i_sb);
- else {
- ret = __logfs_write_inode(inode, NULL, 0);
- /* see indirect_write_block comment */
- BUG_ON(ret);
- }
-}
-
-/*
- * This silences a false, yet annoying gcc warning. I hate it when my editor
- * jumps into bitops.h each time I recompile this file.
- * TODO: Complain to gcc folks about this and upgrade compiler.
- */
-static unsigned long fnb(const unsigned long *addr,
- unsigned long size, unsigned long offset)
-{
- return find_next_bit(addr, size, offset);
-}
-
-static __be64 inode_val0(struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- u64 val;
-
- /*
- * Explicit shifting generates good code, but must match the format
- * of the structure. Add some paranoia just in case.
- */
- BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
- BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
- BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
-
- val = (u64)inode->i_mode << 48 |
- (u64)li->li_height << 40 |
- (u64)li->li_flags;
- return cpu_to_be64(val);
-}
-
-static int inode_write_alias(struct super_block *sb,
- struct logfs_block *block, write_alias_t *write_one_alias)
-{
- struct inode *inode = block->inode;
- struct logfs_inode *li = logfs_inode(inode);
- unsigned long pos;
- u64 ino , bix;
- __be64 val;
- level_t level;
- int err;
-
- for (pos = 0; ; pos++) {
- pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
- if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
- return 0;
-
- switch (pos) {
- case INODE_HEIGHT_OFS:
- val = inode_val0(inode);
- break;
- case INODE_USED_OFS:
- val = cpu_to_be64(li->li_used_bytes);
- break;
- case INODE_SIZE_OFS:
- val = cpu_to_be64(i_size_read(inode));
- break;
- case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
- val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
- break;
- default:
- BUG();
- }
-
- ino = LOGFS_INO_MASTER;
- bix = inode->i_ino;
- level = LEVEL(0);
- err = write_one_alias(sb, ino, bix, level, pos, val);
- if (err)
- return err;
- }
-}
-
-static int indirect_write_alias(struct super_block *sb,
- struct logfs_block *block, write_alias_t *write_one_alias)
-{
- unsigned long pos;
- struct page *page = block->page;
- u64 ino , bix;
- __be64 *child, val;
- level_t level;
- int err;
-
- for (pos = 0; ; pos++) {
- pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
- if (pos >= LOGFS_BLOCK_FACTOR)
- return 0;
-
- ino = page->mapping->host->i_ino;
- logfs_unpack_index(page->index, &bix, &level);
- child = kmap_atomic(page);
- val = child[pos];
- kunmap_atomic(child);
- err = write_one_alias(sb, ino, bix, level, pos, val);
- if (err)
- return err;
- }
-}
-
-int logfs_write_obj_aliases_pagecache(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_block *block;
- int err;
-
- list_for_each_entry(block, &super->s_object_alias, alias_list) {
- err = block->ops->write_alias(sb, block, write_alias_journal);
- if (err)
- return err;
- }
- return 0;
-}
-/*
- * __free_block - unlink a block from its alias list and return it to the pool
- * @sb: superblock owning the s_block_pool mempool
- * @block: block to release; its item_list must already be empty (BUG
- *	otherwise)
- */
-void __free_block(struct super_block *sb, struct logfs_block *block)
-{
- BUG_ON(!list_empty(&block->item_list));
- list_del(&block->alias_list);
- mempool_free(block, logfs_super(sb)->s_block_pool);
-}
-
-/*
- * Clear the owning inode's li_block pointer, then release the block itself
- * through __free_block().
- */
-static void inode_free_block(struct super_block *sb, struct logfs_block *block)
-{
-	logfs_inode(block->inode)->li_block = NULL;
-	__free_block(sb, block);
-}
-/*
- * indirect_free_block - detach a block from its page and free it
- * @sb: superblock, passed on to __free_block()
- * @block: block attached to block->page
- *
- * If the page is marked PagePrivate, that flag is cleared, one page
- * reference is dropped and page_private is reset to 0.  The block is then
- * freed regardless of whether the page was marked.
- */
-static void indirect_free_block(struct super_block *sb,
- struct logfs_block *block)
-{
- struct page *page = block->page;
-
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- put_page(page); /* pairs with get_page() in alloc_data_block() */
- set_page_private(page, 0);
- }
- __free_block(sb, block);
-}
-
-/* Block operations installed by alloc_inode_block() on inode-backed blocks. */
-static const struct logfs_block_ops inode_block_ops = {
- .write_block = inode_write_block,
- .free_block = inode_free_block,
- .write_alias = inode_write_alias,
-};
-/* Block operations installed by alloc_data_block() on page-backed blocks. */
-const struct logfs_block_ops indirect_block_ops = {
- .write_block = indirect_write_block,
- .free_block = indirect_free_block,
- .write_alias = indirect_write_alias,
-};
-/*
- * __alloc_block - take a zeroed logfs_block from the superblock's mempool
- * @sb: superblock providing s_block_pool; also recorded in the block
- * @ino: inode number recorded in the block
- * @bix: block index recorded in the block
- * @level: level recorded in the block
- *
- * Both list heads are initialised empty; every other field not listed above
- * is left zeroed for the caller to fill in.  The mempool_alloc() result is
- * used without a NULL check.
- */
-struct logfs_block *__alloc_block(struct super_block *sb,
- u64 ino, u64 bix, level_t level)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_block *block;
-
- block = mempool_alloc(super->s_block_pool, GFP_NOFS);
- memset(block, 0, sizeof(*block));
- INIT_LIST_HEAD(&block->alias_list);
- INIT_LIST_HEAD(&block->item_list);
- block->sb = sb;
- block->ino = ino;
- block->bix = bix;
- block->level = level;
- return block;
-}
-/*
- * alloc_inode_block - make sure @inode has a logfs_block attached
- *
- * Does nothing if li_block is already set.  Otherwise a block keyed by
- * (LOGFS_INO_MASTER, inode->i_ino, level 0) is allocated, linked to the
- * inode in both directions and given inode_block_ops.
- */
-static void alloc_inode_block(struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct logfs_block *block;
-
- if (li->li_block)
- return;
-
- block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
- block->inode = inode;
- li->li_block = block;
- block->ops = &inode_block_ops;
-}
-/*
- * initialize_block_counters - compute ->partial and ->full for a block
- * @page: page the block belongs to; page->index selects the special cases
- * @block: block whose counters are reset and recomputed
- * @array: mapped contents of @page, read as big-endian pointers
- * @page_is_empty: non-zero to skip scanning @array
- *
- * ->partial counts the non-zero pointers, ->full those that carry
- * LOGFS_FULLY_POPULATED.  Pages with an index below first_indirect_block()
- * keep both counters at 0.  For the page at first_indirect_block() the
- * first I0_BLOCKS slots are not scanned and are pre-counted as full.
- */
-void initialize_block_counters(struct page *page, struct logfs_block *block,
- __be64 *array, int page_is_empty)
-{
- u64 ptr;
- int i, start;
-
- block->partial = 0;
- block->full = 0;
- start = 0;
- if (page->index < first_indirect_block()) {
- /* Counters are pointless on level 0 */
- return;
- }
- if (page->index == first_indirect_block()) {
- /* Skip unused pointers */
- start = I0_BLOCKS;
- block->full = I0_BLOCKS;
- }
- if (!page_is_empty) {
- for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
- ptr = be64_to_cpu(array[i]);
- if (ptr)
- block->partial++;
- if (ptr & LOGFS_FULLY_POPULATED)
- block->full++;
- }
- }
-}
-/*
- * alloc_data_block - attach a freshly allocated logfs_block to @page
- *
- * No-op if the page is already PagePrivate.  Otherwise the block is keyed
- * by @inode's number and the (bix, level) unpacked from page->index, stored
- * in page_private, given indirect_block_ops, and an extra page reference is
- * taken.
- */
-static void alloc_data_block(struct inode *inode, struct page *page)
-{
- struct logfs_block *block;
- u64 bix;
- level_t level;
-
- if (PagePrivate(page))
- return;
-
- logfs_unpack_index(page->index, &bix, &level);
- block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
- block->page = page;
-
- SetPagePrivate(page);
- get_page(page); /* dropped again in indirect_free_block() */
- set_page_private(page, (unsigned long) block);
-
- block->ops = &indirect_block_ops;
-}
-/*
- * alloc_indirect_block - attach a block to @page and set up its counters
- * @inode: inode the page belongs to
- * @page: page holding the pointer array
- * @page_is_empty: forwarded to initialize_block_counters()
- *
- * No-op if the page is already PagePrivate.
- */
-static void alloc_indirect_block(struct inode *inode, struct page *page,
- int page_is_empty)
-{
- struct logfs_block *block;
- __be64 *array;
-
- if (PagePrivate(page))
- return;
-
- alloc_data_block(inode, page);
-
- block = logfs_block(page);
- array = kmap_atomic(page);
- initialize_block_counters(page, block, array, page_is_empty);
- kunmap_atomic(array);
-}
-/*
- * block_set_pointer - store @ptr in slot @index of @page and update counters
- *
- * The page must already have a logfs_block attached (BUG otherwise).  The
- * slot is written in big-endian form, the page is marked uptodate, and the
- * block's ->full and ->partial counters are adjusted by comparing the old
- * and the new pointer value.
- */
-static void block_set_pointer(struct page *page, int index, u64 ptr)
-{
- struct logfs_block *block = logfs_block(page);
- __be64 *array;
- u64 oldptr;
-
- BUG_ON(!block);
- array = kmap_atomic(page);
- oldptr = be64_to_cpu(array[index]);
- array[index] = cpu_to_be64(ptr);
- kunmap_atomic(array);
- SetPageUptodate(page);
-
- block->full += !!(ptr & LOGFS_FULLY_POPULATED)
- - !!(oldptr & LOGFS_FULLY_POPULATED);
- block->partial += !!ptr - !!oldptr;
-}
-
-/* Return slot @index of the pointer array held in @page, in CPU byte order. */
-static u64 block_get_pointer(struct page *page, int index)
-{
-	__be64 *slots = kmap_atomic(page);
-	u64 value = be64_to_cpu(slots[index]);
-
-	kunmap_atomic(slots);
-	return value;
-}
-/* Zero-fill the whole of @page; always returns 0. */
-static int logfs_read_empty(struct page *page)
-{
- zero_user_segment(page, 0, PAGE_SIZE);
- return 0;
-}
-
-/*
- * Read a page addressed by one of the inode's direct pointers.  A zero
- * pointer yields a zero-filled page; otherwise the result of
- * logfs_segment_read() is returned.
- */
-static int logfs_read_direct(struct inode *inode, struct page *page)
-{
-	pgoff_t idx = page->index;
-	u64 ofs = logfs_inode(inode)->li_data[idx];
-
-	if (ofs)
-		return logfs_segment_read(inode, page, ofs, idx, 0);
-	return logfs_read_empty(page);
-}
-
-/*
- * logfs_read_loop - read @page by walking the inode's indirect tree
- * @inode: inode the page belongs to
- * @page: page to fill; page->index encodes block index and target level
- * @rw_context: context (e.g. READ) handed to logfs_get_page() and
- *	logfs_put_page() for the intermediate pages
- *
- * Starting at the indirect root pointer, one indirect block per level is
- * read and the pointer selected by get_bits() is followed until the target
- * level is reached.  The page is zero-filled if the root pointer or any
- * pointer on the way is zero, or if the block index is not below
- * maxbix(li_height).
- *
- * Returns 0 on success (including the zero-filled cases), -ENOMEM if an
- * intermediate page cannot be obtained, or the error from
- * logfs_segment_read().
- */
-static int logfs_read_loop(struct inode *inode, struct page *page,
-		int rw_context)
-{
-	struct logfs_inode *li = logfs_inode(inode);
-	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
-	level_t level, target_level;
-	int ret;
-	struct page *ipage;
-
-	logfs_unpack_index(page->index, &bix, &target_level);
-	if (!bofs)
-		return logfs_read_empty(page);
-
-	if (bix >= maxbix(li->li_height))
-		return logfs_read_empty(page);
-
-	for (level = LEVEL(li->li_height);
-			(__force u8)level > (__force u8)target_level;
-			level = SUBLEVEL(level)){
-		ipage = logfs_get_page(inode, bix, level, rw_context);
-		if (!ipage)
-			return -ENOMEM;
-
-		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
-		if (ret) {
-			/*
-			 * Release with the context the page was taken with,
-			 * exactly like the success path below, instead of
-			 * unconditionally using the read-side helper.
-			 */
-			logfs_put_page(ipage, rw_context);
-			return ret;
-		}
-
-		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
-		logfs_put_page(ipage, rw_context);
-		if (!bofs)
-			return logfs_read_empty(page);
-	}
-
-	return logfs_segment_read(inode, page, bofs, bix, 0);
-}
-
-/*
- * Dispatch a page read: indices below I0_BLOCKS use the inode's direct
- * pointers, everything else walks the indirect tree.
- */
-static int logfs_read_block(struct inode *inode, struct page *page,
-		int rw_context)
-{
-	if (page->index >= I0_BLOCKS)
-		return logfs_read_loop(inode, page, rw_context);
-	return logfs_read_direct(inode, page);
-}
-/*
- * logfs_exist_loop - check the indirect tree for a block at @bix
- *
- * Descends from LEVEL(li_height) towards level 0, following the pointer
- * selected by get_bits() at each step.
- *
- * Returns 1 if a non-zero pointer was found at every step, 0 if the root
- * pointer or any pointer on the way is zero or @bix >= maxbix(li_height),
- * -ENOMEM if a page could not be obtained, or the error returned by
- * logfs_segment_read().
- */
-static int logfs_exist_loop(struct inode *inode, u64 bix)
-{
- struct logfs_inode *li = logfs_inode(inode);
- u64 bofs = li->li_data[INDIRECT_INDEX];
- level_t level;
- int ret;
- struct page *ipage;
-
- if (!bofs)
- return 0;
- if (bix >= maxbix(li->li_height))
- return 0;
-
- for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
- ipage = logfs_get_read_page(inode, bix, level);
- if (!ipage)
- return -ENOMEM;
-
- ret = logfs_segment_read(inode, ipage, bofs, bix, level);
- if (ret) {
- logfs_put_read_page(ipage);
- return ret;
- }
-
- bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
- logfs_put_read_page(ipage);
- if (!bofs)
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Report whether block @bix of @inode has a pointer.  Direct blocks return
- * 0 or 1; indirect blocks return whatever logfs_exist_loop() returns, which
- * may be a negative error.
- */
-int logfs_exist_block(struct inode *inode, u64 bix)
-{
-	if (bix >= I0_BLOCKS)
-		return logfs_exist_loop(inode, bix);
-	return logfs_inode(inode)->li_data[bix] != 0;
-}
-
-/*
- * Scan the direct pointers from @bix upwards.  Returns the first index for
- * which (@data ^ "pointer is zero") is non-zero, or I0_BLOCKS if there is
- * none.
- */
-static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
-{
-	struct logfs_inode *li = logfs_inode(inode);
-
-	while (bix < I0_BLOCKS) {
-		int is_hole = (li->li_data[bix] == 0);
-
-		if (data ^ is_hole)
-			return bix;
-		bix++;
-	}
-	return I0_BLOCKS;
-}
-
-/*
- * seek_holedata_loop - search the indirect tree for the next hole or data
- * @inode: inode to search; its indirect root pointer must be non-zero (BUG
- *	otherwise)
- * @bix: block index to start from
- * @data: non-zero to look for data, zero to look for a hole
- *
- * At every level the slots from the one covering @bix onwards are scanned.
- * When looking for data a slot matches if it is non-zero; when looking for a
- * hole it matches if it lacks LOGFS_FULLY_POPULATED.  Each skipped slot
- * advances @bix to the start of the next slot's range.
- *
- * Returns the block index reached.  If a page cannot be obtained or read,
- * the index reached so far is returned; no error is reported.
- */
-static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
-{
-	struct logfs_inode *li = logfs_inode(inode);
-	__be64 *rblock;
-	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
-	level_t level;
-	int ret, slot;
-	struct page *page;
-
-	BUG_ON(!bofs);
-
-	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
-		/*
-		 * Shift in 64 bits: increment is a u64 and an int-wide shift
-		 * is undefined once the count reaches the width of int.
-		 */
-		increment = 1ULL << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
-		page = logfs_get_read_page(inode, bix, level);
-		if (!page)
-			return bix;
-
-		ret = logfs_segment_read(inode, page, bofs, bix, level);
-		if (ret) {
-			logfs_put_read_page(page);
-			return bix;
-		}
-
-		slot = get_bits(bix, SUBLEVEL(level));
-		rblock = kmap_atomic(page);
-		while (slot < LOGFS_BLOCK_FACTOR) {
-			if (data && (rblock[slot] != 0))
-				break;
-			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
-				break;
-			slot++;
-			bix += increment;
-			bix &= ~(increment - 1);
-		}
-		if (slot >= LOGFS_BLOCK_FACTOR) {
-			kunmap_atomic(rblock);
-			logfs_put_read_page(page);
-			return bix;
-		}
-		bofs = be64_to_cpu(rblock[slot]);
-		kunmap_atomic(rblock);
-		logfs_put_read_page(page);
-		if (!bofs) {
-			BUG_ON(data);
-			return bix;
-		}
-	}
-	return bix;
-}
-
-/**
- * logfs_seek_hole - find next hole starting at a given block index
- * @inode: inode to search in
- * @bix: block index to start searching
- *
- * Block indices below I0_BLOCKS are looked up in the inode's direct
- * pointers first; the indirect tree is only consulted if no hole is found
- * there.
- *
- * Returns next hole. If the file doesn't contain any further holes, the
- * block address next to eof is returned instead.
- */
-u64 logfs_seek_hole(struct inode *inode, u64 bix)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- if (bix < I0_BLOCKS) {
- bix = seek_holedata_direct(inode, bix, 0);
- if (bix < I0_BLOCKS)
- return bix;
- }
-
- if (!li->li_data[INDIRECT_INDEX])
- return bix;
- else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
- bix = maxbix(li->li_height);
- else if (bix >= maxbix(li->li_height))
- return bix;
- else {
- bix = seek_holedata_loop(inode, bix, 0);
- if (bix < maxbix(li->li_height))
- return bix;
- /* Should not happen anymore. But if some port writes semi-
- * corrupt images (as this one used to) we might run into it.
- */
- WARN_ON_ONCE(bix == maxbix(li->li_height));
- }
-
- return bix;
-}
-/*
- * __logfs_seek_data - find the next data block at or after @bix
- *
- * Direct pointers are searched first for indices below I0_BLOCKS.  Beyond
- * that, if @bix is below maxbix(li_height), the result is maxbix(li_height)
- * when there is no indirect root pointer and whatever seek_holedata_loop()
- * returns otherwise.  The result is not clamped to the file size here.
- */
-static u64 __logfs_seek_data(struct inode *inode, u64 bix)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- if (bix < I0_BLOCKS) {
- bix = seek_holedata_direct(inode, bix, 1);
- if (bix < I0_BLOCKS)
- return bix;
- }
-
- if (bix < maxbix(li->li_height)) {
- if (!li->li_data[INDIRECT_INDEX])
- bix = maxbix(li->li_height);
- else
- return seek_holedata_loop(inode, bix, 1);
- }
-
- return bix;
-}
-
-/**
- * logfs_seek_data - find next data block after a given block index
- * @inode: inode to search in
- * @bix: block index to start searching
- *
- * Returns next data block. If no data block is found below the block index
- * corresponding to i_size (i_size >> s_blocksize_bits), the larger of @bix
- * and that index is returned instead.
- */
-u64 logfs_seek_data(struct inode *inode, u64 bix)
-{
- struct super_block *sb = inode->i_sb;
- u64 ret, end;
-
- ret = __logfs_seek_data(inode, bix);
- end = i_size_read(inode) >> sb->s_blocksize_bits;
- if (ret >= end)
- ret = max(bix, end);
- return ret;
-}
-/*
- * Return non-zero if direct pointer @bix of @li, reduced by pure_ofs(),
- * equals @ofs.
- */
-static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
-{
- return pure_ofs(li->li_data[bix]) == ofs;
-}
-/*
- * __logfs_is_valid_loop - look for @ofs on the path from the root to @bix
- * @inode: inode whose indirect tree is walked
- * @bix: block index selecting the path
- * @ofs: offset being looked for
- * @bofs: pointer to start from; logfs_is_valid_loop() passes the indirect
- *	root pointer
- *
- * Returns 1 as soon as a pointer read on the way down matches @ofs after
- * pure_ofs(); 0 if a zero pointer is reached, a read fails, or level 0 is
- * reached without a match.  Failure to get a page is a BUG.
- */
-static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
- u64 ofs, u64 bofs)
-{
- struct logfs_inode *li = logfs_inode(inode);
- level_t level;
- int ret;
- struct page *page;
-
- for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
- page = logfs_get_write_page(inode, bix, level);
- BUG_ON(!page);
-
- ret = logfs_segment_read(inode, page, bofs, bix, level);
- if (ret) {
- logfs_put_write_page(page);
- return 0;
- }
-
- bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
- logfs_put_write_page(page);
- if (!bofs)
- return 0;
-
- if (pure_ofs(bofs) == ofs)
- return 1;
- }
- return 0;
-}
-
-/*
- * Check whether @ofs is referenced on the indirect path to @bix.  Returns 0
- * if there is no root pointer or @bix is not below maxbix(li_height), 1 if
- * the root pointer itself matches, and otherwise the result of walking the
- * tree.
- */
-static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
-{
-	struct logfs_inode *li = logfs_inode(inode);
-	u64 root = li->li_data[INDIRECT_INDEX];
-
-	if (!root || bix >= maxbix(li->li_height))
-		return 0;
-
-	if (pure_ofs(root) == ofs)
-		return 1;
-
-	return __logfs_is_valid_loop(inode, bix, ofs, root);
-}
-/*
- * __logfs_is_valid_block - check whether @inode still references @ofs at @bix
- *
- * An inode with i_nlink == 0 and an i_count of exactly 1 never validates a
- * block.  Otherwise the direct pointers or the indirect tree are consulted,
- * depending on whether @bix is below I0_BLOCKS.
- */
-static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
- return 0;
-
- if (bix < I0_BLOCKS)
- return logfs_is_valid_direct(li, bix, ofs);
- return logfs_is_valid_loop(inode, bix, ofs);
-}
-
-/**
- * logfs_is_valid_block - check whether this block is still valid
- *
- * @sb: superblock
- * @ofs: block physical offset
- * @ino: block inode number
- * @bix: block index
- * @gc_level: block level
- *
- * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
- * become invalid once the journal is written.
- *
- * A block that the inode no longer references, or whose inode cannot be
- * obtained, is reported as 2 while @ofs is still found in the "old" btree
- * of the superblock's shadow tree, and as 0 otherwise.
- */
-int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
- gc_level_t gc_level)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode;
- int ret, cookie;
-
- /* Umount closes a segment with free blocks remaining. Those
- * blocks are by definition invalid. */
- if (ino == -1)
- return 0;
-
- LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
-
- inode = logfs_safe_iget(sb, ino, &cookie);
- if (IS_ERR(inode))
- goto invalid;
-
- ret = __logfs_is_valid_block(inode, bix, ofs);
- logfs_safe_iput(inode, cookie);
- if (ret)
- return ret;
-
-invalid:
- /* Block is nominally invalid, but may still sit in the shadow tree,
- * waiting for a journal commit.
- */
- if (btree_lookup64(&super->s_shadow_tree.old, ofs))
- return 2;
- return 0;
-}
-
-/*
- * Read @page through logfs_read_block() in READ context and update its
- * Uptodate/Error flags to match the outcome.  The dcache is flushed in
- * either case.  Returns the result of the read.
- */
-int logfs_readpage_nolock(struct page *page)
-{
-	struct inode *inode = page->mapping->host;
-	int ret = logfs_read_block(inode, page, READ);
-
-	if (!ret) {
-		SetPageUptodate(page);
-		ClearPageError(page);
-	} else {
-		ClearPageUptodate(page);
-		SetPageError(page);
-	}
-	flush_dcache_page(page);
-
-	return ret;
-}
-/*
- * logfs_reserve_bytes - check that @bytes more bytes could still be written
- *
- * Nothing is actually set aside.  Returns 0 if @bytes is 0 or fits into the
- * available space computed from the superblock counters, -ENOSPC otherwise.
- * Callers without CAP_SYS_RESOURCE must additionally leave s_root_reserve
- * bytes untouched.
- */
-static int logfs_reserve_bytes(struct inode *inode, int bytes)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- u64 available = super->s_free_bytes + super->s_dirty_free_bytes
- - super->s_dirty_used_bytes - super->s_dirty_pages;
-
- if (!bytes)
- return 0;
-
- if (available < bytes)
- return -ENOSPC;
-
- if (available < bytes + super->s_root_reserve &&
- !capable(CAP_SYS_RESOURCE))
- return -ENOSPC;
-
- return 0;
-}
-/*
- * get_page_reserve - account 6 * LOGFS_MAX_OBJECTSIZE bytes against @page
- *
- * Returns 0 immediately if the page's block already holds a reservation.
- * Otherwise, while the space check fails and s_writeback_list is not empty,
- * the first block on that list is written out and the check is retried.  On
- * success the page gets a block (if it had none), the reservation is added
- * to both the block and s_dirty_pages, and the block is moved to the tail
- * of s_writeback_list.
- *
- * Returns 0 on success or the error from logfs_reserve_bytes().
- */
-int get_page_reserve(struct inode *inode, struct page *page)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- struct logfs_block *block = logfs_block(page);
- int ret;
-
- if (block && block->reserved_bytes)
- return 0;
-
- logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
- while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
- !list_empty(&super->s_writeback_list)) {
- block = list_entry(super->s_writeback_list.next,
- struct logfs_block, alias_list);
- block->ops->write_block(block);
- }
- if (!ret) {
- alloc_data_block(inode, page);
- block = logfs_block(page);
- block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
- super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
- list_move_tail(&block->alias_list, &super->s_writeback_list);
- }
- logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
- return ret;
-}
-
-/*
- * We are protected by write lock. Push victims up to superblock level
- * and release transaction when appropriate.
- *
- * Does nothing if @ta is NULL.  Otherwise the transaction pointer in the
- * inode's block is cleared and, depending on ta->state, s_victim_ino,
- * s_rename_dir and s_rename_pos are either recorded from @ta or checked
- * against it and reset.  @ta is freed in the CREATE_2, UNLINK_2,
- * CROSS_RENAME_2 and TARGET_RENAME_3 states.  Only LOGFS_INO_MASTER is
- * handled; any other inode, and any unknown state, hits BUG().
- */
-/* FIXME: This is currently called from the wrong spots. */
-static void logfs_handle_transaction(struct inode *inode,
- struct logfs_transaction *ta)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
-
- if (!ta)
- return;
- logfs_inode(inode)->li_block->ta = NULL;
-
- if (inode->i_ino != LOGFS_INO_MASTER) {
- BUG(); /* FIXME: Yes, this needs more thought */
- /* just remember the transaction until inode is written */
- //BUG_ON(logfs_inode(inode)->li_transaction);
- //logfs_inode(inode)->li_transaction = ta;
- return;
- }
-
- switch (ta->state) {
- case CREATE_1: /* fall through */
- case UNLINK_1:
- BUG_ON(super->s_victim_ino);
- super->s_victim_ino = ta->ino;
- break;
- case CREATE_2: /* fall through */
- case UNLINK_2:
- BUG_ON(super->s_victim_ino != ta->ino);
- super->s_victim_ino = 0;
- /* transaction ends here - free it */
- kfree(ta);
- break;
- case CROSS_RENAME_1:
- BUG_ON(super->s_rename_dir);
- BUG_ON(super->s_rename_pos);
- super->s_rename_dir = ta->dir;
- super->s_rename_pos = ta->pos;
- break;
- case CROSS_RENAME_2:
- BUG_ON(super->s_rename_dir != ta->dir);
- BUG_ON(super->s_rename_pos != ta->pos);
- super->s_rename_dir = 0;
- super->s_rename_pos = 0;
- kfree(ta);
- break;
- case TARGET_RENAME_1:
- BUG_ON(super->s_rename_dir);
- BUG_ON(super->s_rename_pos);
- BUG_ON(super->s_victim_ino);
- super->s_rename_dir = ta->dir;
- super->s_rename_pos = ta->pos;
- super->s_victim_ino = ta->ino;
- break;
- case TARGET_RENAME_2:
- BUG_ON(super->s_rename_dir != ta->dir);
- BUG_ON(super->s_rename_pos != ta->pos);
- BUG_ON(super->s_victim_ino != ta->ino);
- super->s_rename_dir = 0;
- super->s_rename_pos = 0;
- break;
- case TARGET_RENAME_3:
- BUG_ON(super->s_rename_dir);
- BUG_ON(super->s_rename_pos);
- BUG_ON(super->s_victim_ino != ta->ino);
- super->s_victim_ino = 0;
- kfree(ta);
- break;
- default:
- BUG();
- }
-}
-
-/*
- * Not strictly a reservation, but rather a check that we still have enough
- * space to satisfy the write.
- *
- * @blocks is converted to bytes at LOGFS_MAX_OBJECTSIZE per block and
- * checked by logfs_reserve_bytes(), whose result is returned.
- */
-static int logfs_reserve_blocks(struct inode *inode, int blocks)
-{
- return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
-}
-/*
- * State handed down and back up the write paths: @ofs is read as the
- * current pointer value and overwritten with the new one by
- * logfs_write_i0(); @flags holds the WF_* bits.
- */
-struct write_control {
- u64 ofs;
- long flags;
-};
-/*
- * alloc_shadow - allocate and fill in a shadow for one write
- * @inode: inode being written; provides the pool and the inode number
- * @bix: block index recorded in the shadow
- * @level: level, converted with expand_level() before being recorded
- * @old_ofs: previous pointer value; LOGFS_FULLY_POPULATED is masked off
- *
- * The shadow is zeroed before the fields above are set.  The
- * mempool_alloc() result is used without a NULL check.
- */
-static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
- level_t level, u64 old_ofs)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- struct logfs_shadow *shadow;
-
- shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
- memset(shadow, 0, sizeof(*shadow));
- shadow->ino = inode->i_ino;
- shadow->bix = bix;
- shadow->gc_level = expand_level(inode->i_ino, level);
- shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
- return shadow;
-}
-
-/* Give @shadow back to the s_shadow_pool of @inode's superblock. */
-static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
-{
-	mempool_free(shadow, logfs_super(inode->i_sb)->s_shadow_pool);
-}
-/*
- * mark_segment - record @segno in the tree's segment map
- *
- * Inserts @segno if it is not present yet and bumps no_shadowed_segments.
- * A failing insert is a BUG.
- */
-static void mark_segment(struct shadow_tree *tree, u32 segno)
-{
- int err;
-
- if (!btree_lookup32(&tree->segment_map, segno)) {
- err = btree_insert32(&tree->segment_map, segno, (void *)1,
- GFP_NOFS);
- BUG_ON(err);
- tree->no_shadowed_segments++;
- }
-}
-
-/**
- * fill_shadow_tree - Propagate shadow tree changes due to a write
- * @inode: Inode owning the page
- * @page: Struct page that was written
- * @shadow: Shadow for the current write
- *
- * Writes in logfs can result in two semi-valid objects. The old object
- * is still valid as long as it can be reached by following pointers on
- * the medium. Only when writes propagate all the way up to the journal
- * has the new object safely replaced the old one.
- *
- * To handle this problem, a struct logfs_shadow is used to represent
- * every single write. It is attached to the indirect block, which is
- * marked dirty. When the indirect block is written, its shadows are
- * handed up to the next indirect block (or inode). Ultimately they
- * will reach the master inode and be freed upon journal commit.
- *
- * This function handles a single step in the propagation. It adds the
- * shadow for the current write to the tree, along with any shadows in
- * the page's tree, in case it was an indirect block. If a page is
- * written, the inode parameter is left NULL, if an inode is written,
- * the page parameter is left NULL.
- *
- * NOTE(review): the body dereferences @inode and tests PagePrivate(@page)
- * unconditionally, so the "left NULL" wording above looks outdated -
- * confirm against the callers.  The return values of btree_insert64() are
- * not checked.
- */
-static void fill_shadow_tree(struct inode *inode, struct page *page,
- struct logfs_shadow *shadow)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- struct logfs_block *block = logfs_block(page);
- struct shadow_tree *tree = &super->s_shadow_tree;
-
- if (PagePrivate(page)) {
- if (block->alias_map)
- super->s_no_object_aliases -= bitmap_weight(
- block->alias_map, LOGFS_BLOCK_FACTOR);
- logfs_handle_transaction(inode, block->ta);
- block->ops->free_block(inode->i_sb, block);
- }
- if (shadow) {
- if (shadow->old_ofs)
- btree_insert64(&tree->old, shadow->old_ofs, shadow,
- GFP_NOFS);
- else
- btree_insert64(&tree->new, shadow->new_ofs, shadow,
- GFP_NOFS);
-
- super->s_dirty_used_bytes += shadow->new_len;
- super->s_dirty_free_bytes += shadow->old_len;
- mark_segment(tree, shadow->old_ofs >> super->s_segshift);
- mark_segment(tree, shadow->new_ofs >> super->s_segshift);
- }
-}
-/*
- * logfs_set_alias - mark slot @child_no of @block as carrying an alias
- *
- * Blocks belonging to the master inode are ignored.  Otherwise the bit is
- * set in alias_map (counting it in s_no_object_aliases the first time) and
- * the block is moved to the tail of s_object_alias.
- */
-static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
- long child_no)
-{
- struct logfs_super *super = logfs_super(sb);
-
- if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
- /* Aliases in the master inode are pointless. */
- return;
- }
-
- if (!test_bit(child_no, block->alias_map)) {
- set_bit(child_no, block->alias_map);
- super->s_no_object_aliases++;
- }
- list_move_tail(&block->alias_list, &super->s_object_alias);
-}
-
-/*
- * Object aliases can and often do change the size and occupied space of a
- * file. So not only do we have to change the pointers, we also have to
- * change inode->i_size and li->li_used_bytes. Which is done by setting
- * another two object aliases for the inode itself.
- *
- * Nothing is done when the shadow's old and new lengths are equal.
- */
-static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
-{
- struct logfs_inode *li = logfs_inode(inode);
-
- if (shadow->new_len == shadow->old_len)
- return;
-
- alloc_inode_block(inode);
- li->li_used_bytes += shadow->new_len - shadow->old_len;
- __logfs_set_blocks(inode);
- logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
- logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
-}
-/*
- * logfs_write_i0 - write and/or delete a single object and shadow the change
- * @inode: inode the page belongs to
- * @page: page to write; page->index encodes block index and level
- * @wc: in: current pointer (wc->ofs) and WF_* flags; out: new pointer
- *
- * A space check for one block is made only when wc->ofs is 0.  WF_WRITE
- * triggers logfs_segment_write(), WF_DELETE triggers logfs_segment_delete();
- * the delete is issued even if the preceding write returned an error.  On
- * success the shadow is handed to fill_shadow_tree() and wc->ofs becomes
- * shadow->new_ofs.  A non-zero new offset additionally gets
- * LOGFS_FULLY_POPULATED for level-0 pages and for indirect blocks whose
- * ->full counter equals LOGFS_BLOCK_FACTOR.
- *
- * Returns 0, -ENOSPC, or the error from logfs_segment_write().
- */
-static int logfs_write_i0(struct inode *inode, struct page *page,
- struct write_control *wc)
-{
- struct logfs_shadow *shadow;
- u64 bix;
- level_t level;
- int full, err = 0;
-
- logfs_unpack_index(page->index, &bix, &level);
- if (wc->ofs == 0)
- if (logfs_reserve_blocks(inode, 1))
- return -ENOSPC;
-
- shadow = alloc_shadow(inode, bix, level, wc->ofs);
- if (wc->flags & WF_WRITE)
- err = logfs_segment_write(inode, page, shadow);
- if (wc->flags & WF_DELETE)
- logfs_segment_delete(inode, shadow);
- if (err) {
- free_shadow(inode, shadow);
- return err;
- }
-
- set_iused(inode, shadow);
- full = 1;
- if (level != 0) {
- alloc_indirect_block(inode, page, 0);
- full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
- }
- fill_shadow_tree(inode, page, shadow);
- wc->ofs = shadow->new_ofs;
- if (wc->ofs && full)
- wc->ofs |= LOGFS_FULLY_POPULATED;
- return 0;
-}
-/*
- * logfs_write_direct - write a page addressed by a direct inode pointer
- *
- * page->index selects the li_data[] slot.  On success the slot is updated
- * with the new pointer and the matching alias (index + INODE_POINTER_OFS)
- * is set on the inode's block.
- *
- * Returns 0 or the error from logfs_write_i0().
- */
-static int logfs_write_direct(struct inode *inode, struct page *page,
- long flags)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct write_control wc = {
- .ofs = li->li_data[page->index],
- .flags = flags,
- };
- int err;
-
- alloc_inode_block(inode);
-
- err = logfs_write_i0(inode, page, &wc);
- if (err)
- return err;
-
- li->li_data[page->index] = wc.ofs;
- logfs_set_alias(inode->i_sb, li->li_block,
- page->index + INODE_POINTER_OFS);
- return 0;
-}
-
-/*
- * Compare the empty/full state encoded in pointer @ofs with the state of
- * the block attached to @page.  Returns 1 if either the "empty" or the
- * "fully populated" state differs, 0 if both agree.
- */
-static int ptr_change(u64 ofs, struct page *page)
-{
-	struct logfs_block *blk = logfs_block(page);
-	int ptr_empty = (ofs == 0);
-	int blk_empty = (blk->partial == 0);
-	int ptr_full, blk_full;
-
-	if (ptr_empty != blk_empty)
-		return 1;
-
-	/* The !! is necessary to shrink result to int */
-	ptr_full = !!(ofs & LOGFS_FULLY_POPULATED);
-	blk_full = (blk->full == LOGFS_BLOCK_FACTOR);
-	return ptr_full != blk_full;
-}
-/*
- * __logfs_write_rec - write @page and the indirect blocks above it
- * @inode: inode being written
- * @page: target page
- * @this_wc: pointer/flags for the indirect block at @level; updated on return
- * @bix: block index of the target page
- * @target_level: level of the target page
- * @level: level of the indirect block handled by this call
- *
- * Loads (or zero-fills) the indirect block, recurses until the level
- * directly above @target_level is reached, writes the child, and stores the
- * child's new pointer in the indirect block.  The indirect block itself is
- * rewritten if it had no pointer yet or ptr_change() reports a state
- * change; otherwise only an alias for the child's slot is recorded.
- *
- * Returns 0, -ENOMEM if the indirect page cannot be obtained, or the first
- * error from reading or writing.
- */
-static int __logfs_write_rec(struct inode *inode, struct page *page,
- struct write_control *this_wc,
- pgoff_t bix, level_t target_level, level_t level)
-{
- int ret, page_empty = 0;
- int child_no = get_bits(bix, SUBLEVEL(level));
- struct page *ipage;
- struct write_control child_wc = {
- .flags = this_wc->flags,
- };
-
- ipage = logfs_get_write_page(inode, bix, level);
- if (!ipage)
- return -ENOMEM;
-
- if (this_wc->ofs) {
- ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
- if (ret)
- goto out;
- } else if (!PageUptodate(ipage)) {
- page_empty = 1;
- logfs_read_empty(ipage);
- }
-
- child_wc.ofs = block_get_pointer(ipage, child_no);
-
- if ((__force u8)level-1 > (__force u8)target_level)
- ret = __logfs_write_rec(inode, page, &child_wc, bix,
- target_level, SUBLEVEL(level));
- else
- ret = logfs_write_i0(inode, page, &child_wc);
-
- if (ret)
- goto out;
-
- alloc_indirect_block(inode, ipage, page_empty);
- block_set_pointer(ipage, child_no, child_wc.ofs);
- /* FIXME: first condition seems superfluous */
- if (child_wc.ofs || logfs_block(ipage)->partial)
- this_wc->flags |= WF_WRITE;
- /* the condition on this_wc->ofs ensures that we won't consume extra
- * space for indirect blocks in the future, which we cannot reserve */
- if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
- ret = logfs_write_i0(inode, ipage, this_wc);
- else
- logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
-out:
- logfs_put_write_page(ipage);
- return ret;
-}
-/*
- * logfs_write_rec - write @page through the inode's indirect tree
- *
- * Starts at the indirect root pointer.  If li_height is above
- * @target_level the tree is walked by __logfs_write_rec(); otherwise the
- * page is written directly with the root pointer as its old offset.  When
- * the root pointer changed it is stored back and aliased on the inode's
- * block.
- */
-static int logfs_write_rec(struct inode *inode, struct page *page,
- pgoff_t bix, level_t target_level, long flags)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct write_control wc = {
- .ofs = li->li_data[INDIRECT_INDEX],
- .flags = flags,
- };
- int ret;
-
- alloc_inode_block(inode);
-
- if (li->li_height > (__force u8)target_level)
- ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
- LEVEL(li->li_height));
- else
- ret = logfs_write_i0(inode, page, &wc);
- if (ret)
- return ret;
-
- if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
- li->li_data[INDIRECT_INDEX] = wc.ofs;
- logfs_set_alias(inode->i_sb, li->li_block,
- INDIRECT_INDEX + INODE_POINTER_OFS);
- }
- return ret;
-}
-/* Attach @ta to @inode's block, allocating the block first if necessary. */
-void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
-{
- alloc_inode_block(inode);
- logfs_inode(inode)->li_block->ta = ta;
-}
-
-/*
- * Clear the transaction pointer on @inode's block, if the inode has a block
- * and that block carries a transaction.  @ta itself is not examined.
- */
-void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
-{
-	struct logfs_block *blk = logfs_inode(inode)->li_block;
-
-	if (!blk || !blk->ta)
-		return;
-	blk->ta = NULL;
-}
-/*
- * grow_inode - add tree levels until @bix at @level fits
- *
- * While the requested level exceeds li_height or @bix is not below
- * maxbix(li_height), a new empty indirect block one level up is created,
- * its slot 0 is pointed at the old root, and it is written out to become
- * the new root.  Heights above 5 are a BUG.
- *
- * Returns 0, -ENOMEM, or the error from logfs_write_i0().
- */
-static int grow_inode(struct inode *inode, u64 bix, level_t level)
-{
- struct logfs_inode *li = logfs_inode(inode);
- u8 height = (__force u8)level;
- struct page *page;
- struct write_control wc = {
- .flags = WF_WRITE,
- };
- int err;
-
- BUG_ON(height > 5 || li->li_height > 5);
- while (height > li->li_height || bix >= maxbix(li->li_height)) {
- page = logfs_get_write_page(inode, I0_BLOCKS + 1,
- LEVEL(li->li_height + 1));
- if (!page)
- return -ENOMEM;
- logfs_read_empty(page);
- alloc_indirect_block(inode, page, 1);
- block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
- err = logfs_write_i0(inode, page, &wc);
- logfs_put_write_page(page);
- if (err)
- return err;
- li->li_data[INDIRECT_INDEX] = wc.ofs;
- wc.ofs = 0; /* the next new level starts without an old offset */
- li->li_height++;
- logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
- }
- return 0;
-}
-/*
- * __logfs_write_buf - write @page, replacing any previous version
- *
- * WF_WRITE and WF_DELETE are always added to @flags, and ctime/mtime are
- * updated.  A reservation recorded on the page's block is subtracted from
- * s_dirty_pages.  Indices below I0_BLOCKS go through the direct pointers;
- * anything else may first grow the tree and then goes through
- * logfs_write_rec().
- */
-static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
-{
- struct logfs_super *super = logfs_super(inode->i_sb);
- pgoff_t index = page->index;
- u64 bix;
- level_t level;
- int err;
-
- flags |= WF_WRITE | WF_DELETE;
- inode->i_ctime = inode->i_mtime = current_time(inode);
-
- logfs_unpack_index(index, &bix, &level);
- if (logfs_block(page) && logfs_block(page)->reserved_bytes)
- super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
-
- if (index < I0_BLOCKS)
- return logfs_write_direct(inode, page, flags);
-
- bix = adjust_bix(bix, level);
- err = grow_inode(inode, bix, level);
- if (err)
- return err;
- return logfs_write_rec(inode, page, bix, level, flags);
-}
-/*
- * logfs_write_buf - write @page between logfs_get_wblocks() and
- * logfs_put_wblocks()
- *
- * Only the WF_LOCK bit of @flags is passed to those two helpers; the full
- * @flags go to __logfs_write_buf(), whose result is returned.
- */
-int logfs_write_buf(struct inode *inode, struct page *page, long flags)
-{
- struct super_block *sb = inode->i_sb;
- int ret;
-
- logfs_get_wblocks(sb, page, flags & WF_LOCK);
- ret = __logfs_write_buf(inode, page, flags);
- logfs_put_wblocks(sb, page, flags & WF_LOCK);
- return ret;
-}
-/*
- * __logfs_delete - delete the level-0 block backing @page
- *
- * Uses the same direct/indirect paths as a write, but with only WF_DELETE
- * set.  ctime and mtime are updated.
- */
-static int __logfs_delete(struct inode *inode, struct page *page)
-{
- long flags = WF_DELETE;
- int err;
-
- inode->i_ctime = inode->i_mtime = current_time(inode);
-
- if (page->index < I0_BLOCKS)
- return logfs_write_direct(inode, page, flags);
- err = grow_inode(inode, page->index, 0);
- if (err)
- return err;
- return logfs_write_rec(inode, page, page->index, 0, flags);
-}
-/*
- * logfs_delete - delete block @index of @inode
- * @inode: inode to delete from
- * @index: page index of the level-0 block
- * @shadow_tree: not used by this function
- *
- * Returns 0, -ENOMEM if the page could not be obtained, or the error from
- * __logfs_delete().
- */
-int logfs_delete(struct inode *inode, pgoff_t index,
- struct shadow_tree *shadow_tree)
-{
- struct super_block *sb = inode->i_sb;
- struct page *page;
- int ret;
-
- page = logfs_get_read_page(inode, index, 0);
- if (!page)
- return -ENOMEM;
-
- logfs_get_wblocks(sb, page, 1);
- ret = __logfs_delete(inode, page);
- logfs_put_wblocks(sb, page, 1);
-
- logfs_put_read_page(page);
-
- return ret;
-}
-/*
- * logfs_rewrite_block - read a block from @ofs and write it out again
- * @inode: inode owning the block
- * @bix: block index
- * @ofs: offset the block is read from
- * @gc_level: level, converted with shrink_level() before use
- * @flags: WF_* flags passed to logfs_write_buf()
- *
- * After a successful rewrite of a level-0 block, the anchor is written for
- * the master inode and the inode itself for any other inode.
- *
- * Returns 0, -ENOMEM, or the first error from reading or writing.
- */
-int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
- gc_level_t gc_level, long flags)
-{
- level_t level = shrink_level(gc_level);
- struct page *page;
- int err;
-
- page = logfs_get_write_page(inode, bix, level);
- if (!page)
- return -ENOMEM;
-
- err = logfs_segment_read(inode, page, ofs, bix, level);
- if (!err) {
- if (level != 0)
- alloc_indirect_block(inode, page, 0);
- err = logfs_write_buf(inode, page, flags);
- if (!err && shrink_level(gc_level) == 0) {
- /* Rewrite cannot mark the inode dirty but has to
- * write it immediately.
- * Q: Can't we just create an alias for the inode
- * instead? And if not, why not?
- */
- if (inode->i_ino == LOGFS_INO_MASTER)
- logfs_write_anchor(inode->i_sb);
- else {
- err = __logfs_write_inode(inode, page, flags);
- }
- }
- }
- logfs_put_write_page(page);
- return err;
-}
-/*
- * truncate_data_block - zero the tail of the page containing @size
- *
- * Returns 0 without touching anything unless @size falls strictly inside
- * @page.  Otherwise the block is read from @ofs, everything from @size to
- * the end of the page is zeroed, and the page is written back through
- * logfs_segment_write() using @shadow.  Only level-0 pages are accepted
- * (BUG otherwise).
- */
-static int truncate_data_block(struct inode *inode, struct page *page,
- u64 ofs, struct logfs_shadow *shadow, u64 size)
-{
- loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
- u64 bix;
- level_t level;
- int err;
-
- /* Does truncation happen within this page? */
- if (size <= pageofs || size - pageofs >= PAGE_SIZE)
- return 0;
-
- logfs_unpack_index(page->index, &bix, &level);
- BUG_ON(level != 0);
-
- err = logfs_segment_read(inode, page, ofs, bix, level);
- if (err)
- return err;
-
- zero_user_segment(page, size - pageofs, PAGE_SIZE);
- return logfs_segment_write(inode, page, shadow);
-}
-/*
- * logfs_truncate_i0 - truncate or drop one level-0 block
- * @inode: inode being truncated
- * @page: level-0 page (BUG otherwise)
- * @wc: in: current pointer; out: shadow->new_ofs
- * @size: new file size
- *
- * The page is partially rewritten by truncate_data_block() when @size lies
- * inside it; the old object is then deleted and the change shadowed.
- *
- * Returns 0 or the error from truncate_data_block().
- */
-static int logfs_truncate_i0(struct inode *inode, struct page *page,
- struct write_control *wc, u64 size)
-{
- struct logfs_shadow *shadow;
- u64 bix;
- level_t level;
- int err = 0;
-
- logfs_unpack_index(page->index, &bix, &level);
- BUG_ON(level != 0);
- shadow = alloc_shadow(inode, bix, level, wc->ofs);
-
- err = truncate_data_block(inode, page, wc->ofs, shadow, size);
- if (err) {
- free_shadow(inode, shadow);
- return err;
- }
-
- logfs_segment_delete(inode, shadow);
- set_iused(inode, shadow);
- fill_shadow_tree(inode, page, shadow);
- wc->ofs = shadow->new_ofs;
- return 0;
-}
-/*
- * logfs_truncate_direct - truncate the blocks behind the direct pointers
- *
- * Walks the direct pointers from the last one downwards and stops at the
- * first block that ends before @size.  Slots with a zero pointer are
- * skipped.  Only wc.ofs is initialised here; logfs_truncate_i0() does not
- * read wc.flags.
- *
- * Returns 0, -ENOMEM, or the first error from reading or truncating.
- */
-static int logfs_truncate_direct(struct inode *inode, u64 size)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct write_control wc;
- struct page *page;
- int e;
- int err;
-
- alloc_inode_block(inode);
-
- for (e = I0_BLOCKS - 1; e >= 0; e--) {
- if (size > (e+1) * LOGFS_BLOCKSIZE)
- break;
-
- wc.ofs = li->li_data[e];
- if (!wc.ofs)
- continue;
-
- page = logfs_get_write_page(inode, e, 0);
- if (!page)
- return -ENOMEM;
- err = logfs_segment_read(inode, page, wc.ofs, e, 0);
- if (err) {
- logfs_put_write_page(page);
- return err;
- }
- err = logfs_truncate_i0(inode, page, &wc, size);
- logfs_put_write_page(page);
- if (err)
- return err;
-
- li->li_data[e] = wc.ofs;
- }
- return 0;
-}
-
-/*
- * Block-index stride per level, indexed by level: logfs_step() returns the
- * entry as is, logfs_factor() multiplies it by LOGFS_BLOCKSIZE.
- *
- * FIXME: these need to become per-sb once we support different blocksizes
- */
-static u64 __logfs_step[] = {
- 1,
- I1_BLOCKS,
- I2_BLOCKS,
- I3_BLOCKS,
-};
-/* Per-level start index, indexed by level; read via logfs_start_index(). */
-static u64 __logfs_start_index[] = {
- I0_BLOCKS,
- I1_BLOCKS,
- I2_BLOCKS,
- I3_BLOCKS
-};
-/* Return the __logfs_step[] entry for @level; no bounds check is made. */
-static inline u64 logfs_step(level_t level)
-{
- return __logfs_step[(__force u8)level];
-}
-/*
- * Return the __logfs_step[] entry for @level multiplied by LOGFS_BLOCKSIZE;
- * no bounds check is made.
- */
-static inline u64 logfs_factor(u8 level)
-{
- return __logfs_step[level] * LOGFS_BLOCKSIZE;
-}
-/* Return the __logfs_start_index[] entry for @level; no bounds check. */
-static inline u64 logfs_start_index(level_t level)
-{
- return __logfs_start_index[(__force u8)level];
-}
-/*
- * Like logfs_unpack_index(), but *bix is forced to 0 when it does not
- * exceed the start index of the level below *level.
- *
- * NOTE(review): SUBLEVEL(*level) is used as an array index, so this looks
- * intended for indirect pages only (*level >= 1) - confirm.
- */
-static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
-{
- logfs_unpack_index(index, bix, level);
- if (*bix <= logfs_start_index(SUBLEVEL(*level)))
- *bix = 0;
-}
-/*
- * __logfs_truncate_rec - truncate everything below one indirect block
- * @inode: inode being truncated
- * @ipage: page for the indirect block; filled by reading this_wc->ofs
- * @this_wc: in: pointer to the indirect block; out: its new pointer
- * @size: new file size
- *
- * Slots are visited from the last one downwards, stopping at the first
- * child that ends before @size.  Each non-empty child is truncated
- * (recursively above level 1) and its new pointer stored back.  If no slot
- * was touched a message is printed and 0 returned; otherwise the indirect
- * block is deleted and, if it still has non-zero pointers, rewritten.
- *
- * Returns 0, -ENOMEM, or the first error from reading, truncating or
- * writing.
- */
-static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
- struct write_control *this_wc, u64 size)
-{
- int truncate_happened = 0;
- int e, err = 0;
- u64 bix, child_bix, next_bix;
- level_t level;
- struct page *page;
- struct write_control child_wc = { /* FIXME: flags */ };
-
- logfs_unpack_raw_index(ipage->index, &bix, &level);
- err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
- if (err)
- return err;
-
- for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
- child_bix = bix + e * logfs_step(SUBLEVEL(level));
- next_bix = child_bix + logfs_step(SUBLEVEL(level));
- if (size > next_bix * LOGFS_BLOCKSIZE)
- break;
-
- child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
- if (!child_wc.ofs)
- continue;
-
- page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
- if (!page)
- return -ENOMEM;
-
- if ((__force u8)level > 1)
- err = __logfs_truncate_rec(inode, page, &child_wc, size);
- else
- err = logfs_truncate_i0(inode, page, &child_wc, size);
- logfs_put_write_page(page);
- if (err)
- return err;
-
- truncate_happened = 1;
- alloc_indirect_block(inode, ipage, 0);
- block_set_pointer(ipage, e, child_wc.ofs);
- }
-
- if (!truncate_happened) {
- printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
- return 0;
- }
-
- this_wc->flags = WF_DELETE;
- if (logfs_block(ipage)->partial)
- this_wc->flags |= WF_WRITE;
-
- return logfs_write_i0(inode, ipage, this_wc);
-}
-
-static int logfs_truncate_rec(struct inode *inode, u64 size)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct write_control wc = {
- .ofs = li->li_data[INDIRECT_INDEX],
- };
- struct page *page;
- int err;
-
- alloc_inode_block(inode);
-
- if (!wc.ofs)
- return 0;
-
- page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
- if (!page)
- return -ENOMEM;
-
- err = __logfs_truncate_rec(inode, page, &wc, size);
- logfs_put_write_page(page);
- if (err)
- return err;
-
- if (li->li_data[INDIRECT_INDEX] != wc.ofs)
- li->li_data[INDIRECT_INDEX] = wc.ofs;
- return 0;
-}
-
-static int __logfs_truncate(struct inode *inode, u64 size)
-{
- int ret;
-
- if (size >= logfs_factor(logfs_inode(inode)->li_height))
- return 0;
-
- ret = logfs_truncate_rec(inode, size);
- if (ret)
- return ret;
-
- return logfs_truncate_direct(inode, size);
-}
-
-/*
- * Truncate, by changing the segment file, can consume a fair amount
- * of resources. So back off from time to time and do some GC.
- * 8 or 2048 blocks should be well within safety limits even if
- * every single block resided in a different segment.
- */
-#define TRUNCATE_STEP (8 * 1024 * 1024)
-int logfs_truncate(struct inode *inode, u64 target)
-{
- struct super_block *sb = inode->i_sb;
- u64 size = i_size_read(inode);
- int err = 0;
-
- size = ALIGN(size, TRUNCATE_STEP);
- while (size > target) {
- if (size > TRUNCATE_STEP)
- size -= TRUNCATE_STEP;
- else
- size = 0;
- if (size < target)
- size = target;
-
- logfs_get_wblocks(sb, NULL, 1);
- err = __logfs_truncate(inode, size);
- if (!err)
- err = __logfs_write_inode(inode, NULL, 0);
- logfs_put_wblocks(sb, NULL, 1);
- }
-
- if (!err) {
- err = inode_newsize_ok(inode, target);
- if (err)
- goto out;
-
- truncate_setsize(inode, target);
- }
-
- out:
- /* I don't trust error recovery yet. */
- WARN_ON(err);
- return err;
-}
-
-static void move_page_to_inode(struct inode *inode, struct page *page)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct logfs_block *block = logfs_block(page);
-
- if (!block)
- return;
-
- log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
- block->ino, block->bix, block->level);
- BUG_ON(li->li_block);
- block->ops = &inode_block_ops;
- block->inode = inode;
- li->li_block = block;
-
- block->page = NULL;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- put_page(page);
- set_page_private(page, 0);
- }
-}
-
-static void move_inode_to_page(struct page *page, struct inode *inode)
-{
- struct logfs_inode *li = logfs_inode(inode);
- struct logfs_block *block = li->li_block;
-
- if (!block)
- return;
-
- log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
- block->ino, block->bix, block->level);
- BUG_ON(PagePrivate(page));
- block->ops = &indirect_block_ops;
- block->page = page;
-
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long) block);
- }
-
- block->inode = NULL;
- li->li_block = NULL;
-}
-
-int logfs_read_inode(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_super *super = logfs_super(sb);
- struct inode *master_inode = super->s_master_inode;
- struct page *page;
- struct logfs_disk_inode *di;
- u64 ino = inode->i_ino;
-
- if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
- return -ENODATA;
- if (!logfs_exist_block(master_inode, ino))
- return -ENODATA;
-
- page = read_cache_page(master_inode->i_mapping, ino,
- (filler_t *)logfs_readpage, NULL);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- di = kmap_atomic(page);
- logfs_disk_to_inode(di, inode);
- kunmap_atomic(di);
- move_page_to_inode(inode, page);
- put_page(page);
- return 0;
-}
-
-/* Caller must logfs_put_write_page(page); */
-static struct page *inode_to_page(struct inode *inode)
-{
- struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
- struct logfs_disk_inode *di;
- struct page *page;
-
- BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
-
- page = logfs_get_write_page(master_inode, inode->i_ino, 0);
- if (!page)
- return NULL;
-
- di = kmap_atomic(page);
- logfs_inode_to_disk(inode, di);
- kunmap_atomic(di);
- move_inode_to_page(page, inode);
- return page;
-}
-
-static int do_write_inode(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct inode *master_inode = logfs_super(sb)->s_master_inode;
- loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
- struct page *page;
- int err;
-
- BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
- /* FIXME: lock inode */
-
- if (i_size_read(master_inode) < size)
- i_size_write(master_inode, size);
-
- /* TODO: Tell vfs this inode is clean now */
-
- page = inode_to_page(inode);
- if (!page)
- return -ENOMEM;
-
- /* FIXME: transaction is part of logfs_block now. Is that enough? */
- err = logfs_write_buf(master_inode, page, 0);
- if (err)
- move_page_to_inode(inode, page);
-
- logfs_put_write_page(page);
- return err;
-}
-
-static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
- int write,
- void (*change_se)(struct logfs_segment_entry *, long),
- long arg)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode;
- struct page *page;
- struct logfs_segment_entry *se;
- pgoff_t page_no;
- int child_no;
-
- page_no = segno >> (sb->s_blocksize_bits - 3);
- child_no = segno & ((sb->s_blocksize >> 3) - 1);
-
- inode = super->s_segfile_inode;
- page = logfs_get_write_page(inode, page_no, 0);
- BUG_ON(!page); /* FIXME: We need some reserve page for this case */
- if (!PageUptodate(page))
- logfs_read_block(inode, page, WRITE);
-
- if (write)
- alloc_indirect_block(inode, page, 0);
- se = kmap_atomic(page);
- change_se(se + child_no, arg);
- if (write) {
- logfs_set_alias(sb, logfs_block(page), child_no);
- BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
- }
- kunmap_atomic(se);
-
- logfs_put_write_page(page);
-}
-
-static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
-{
- struct logfs_segment_entry *target = (void *)_target;
-
- *target = *se;
-}
-
-void logfs_get_segment_entry(struct super_block *sb, u32 segno,
- struct logfs_segment_entry *se)
-{
- logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
-}
-
-static void __set_segment_used(struct logfs_segment_entry *se, long increment)
-{
- u32 valid;
-
- valid = be32_to_cpu(se->valid);
- valid += increment;
- se->valid = cpu_to_be32(valid);
-}
-
-void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
-{
- struct logfs_super *super = logfs_super(sb);
- u32 segno = ofs >> super->s_segshift;
-
- if (!increment)
- return;
-
- logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
-}
-
-static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
-{
- se->ec_level = cpu_to_be32(ec_level);
-}
-
-void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
- gc_level_t gc_level)
-{
- u32 ec_level = ec << 4 | (__force u8)gc_level;
-
- logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
-}
-
-static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
-{
- se->valid = cpu_to_be32(RESERVED);
-}
-
-void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
-{
- logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
-}
-
-static void __set_segment_unreserved(struct logfs_segment_entry *se,
- long ec_level)
-{
- se->valid = 0;
- se->ec_level = cpu_to_be32(ec_level);
-}
-
-void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
-{
- u32 ec_level = ec << 4;
-
- logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
- ec_level);
-}
-
-int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
-{
- struct super_block *sb = inode->i_sb;
- int ret;
-
- logfs_get_wblocks(sb, page, flags & WF_LOCK);
- ret = do_write_inode(inode);
- logfs_put_wblocks(sb, page, flags & WF_LOCK);
- return ret;
-}
-
-static int do_delete_inode(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct inode *master_inode = logfs_super(sb)->s_master_inode;
- struct page *page;
- int ret;
-
- page = logfs_get_write_page(master_inode, inode->i_ino, 0);
- if (!page)
- return -ENOMEM;
-
- move_inode_to_page(page, inode);
-
- logfs_get_wblocks(sb, page, 1);
- ret = __logfs_delete(master_inode, page);
- logfs_put_wblocks(sb, page, 1);
-
- logfs_put_write_page(page);
- return ret;
-}
-
-/*
- * ZOMBIE inodes have already been deleted before and should remain dead,
- * if it weren't for valid checking. No need to kill them again here.
- */
-void logfs_evict_inode(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_inode *li = logfs_inode(inode);
- struct logfs_block *block = li->li_block;
- struct page *page;
-
- if (!inode->i_nlink) {
- if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
- li->li_flags |= LOGFS_IF_ZOMBIE;
- if (i_size_read(inode) > 0)
- logfs_truncate(inode, 0);
- do_delete_inode(inode);
- }
- }
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-
- /* Cheaper version of write_inode. All changes are concealed in
- * aliases, which are moved back. No write to the medium happens.
- */
- /* Only deleted files may be dirty at this point */
- BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
- if (!block)
- return;
- if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
- block->ops->free_block(inode->i_sb, block);
- return;
- }
-
- page = inode_to_page(inode);
- BUG_ON(!page); /* FIXME: Use emergency page */
- logfs_put_write_page(page);
-}
-
-void btree_write_block(struct logfs_block *block)
-{
- struct inode *inode;
- struct page *page;
- int err, cookie;
-
- inode = logfs_safe_iget(block->sb, block->ino, &cookie);
- page = logfs_get_write_page(inode, block->bix, block->level);
-
- err = logfs_readpage_nolock(page);
- BUG_ON(err);
- BUG_ON(!PagePrivate(page));
- BUG_ON(logfs_block(page) != block);
- err = __logfs_write_buf(inode, page, 0);
- BUG_ON(err);
- BUG_ON(PagePrivate(page) || page->private);
-
- logfs_put_write_page(page);
- logfs_safe_iput(inode, cookie);
-}
-
-/**
- * logfs_inode_write - write inode or dentry objects
- *
- * @inode: parent inode (ifile or directory)
- * @buf: object to write (inode or dentry)
- * @count: object size
- * @bix: block index
- * @flags: write flags
- * @shadow_tree: shadow below this inode
- *
- * FIXME: All caller of this put a 200-300 byte variable on the stack,
- * only to call here and do a memcpy from that stack variable. A good
- * example of wasted performance and stack space.
- */
-int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
- loff_t bix, long flags, struct shadow_tree *shadow_tree)
-{
- loff_t pos = bix << inode->i_sb->s_blocksize_bits;
- int err;
- struct page *page;
- void *pagebuf;
-
- BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
- BUG_ON(count > LOGFS_BLOCKSIZE);
- page = logfs_get_write_page(inode, bix, 0);
- if (!page)
- return -ENOMEM;
-
- pagebuf = kmap_atomic(page);
- memcpy(pagebuf, buf, count);
- flush_dcache_page(page);
- kunmap_atomic(pagebuf);
-
- if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
- i_size_write(inode, pos + LOGFS_BLOCKSIZE);
-
- err = logfs_write_buf(inode, page, flags);
- logfs_put_write_page(page);
- return err;
-}
-
-int logfs_open_segfile(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *inode;
-
- inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- super->s_segfile_inode = inode;
- return 0;
-}
-
-int logfs_init_rw(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int min_fill = 3 * super->s_no_blocks;
-
- INIT_LIST_HEAD(&super->s_object_alias);
- INIT_LIST_HEAD(&super->s_writeback_list);
- mutex_init(&super->s_write_mutex);
- super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
- sizeof(struct logfs_block));
- super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
- sizeof(struct logfs_shadow));
- return 0;
-}
-
-void logfs_cleanup_rw(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- logfs_mempool_destroy(super->s_block_pool);
- logfs_mempool_destroy(super->s_shadow_pool);
-}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
deleted file mode 100644
index 1efd6055f4b0..000000000000
--- a/fs/logfs/segment.c
+++ /dev/null
@@ -1,961 +0,0 @@
-/*
- * fs/logfs/segment.c - Handling the Object Store
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- * Object store or ostore makes up the complete device with exception of
- * the superblock and journal areas. Apart from its own metadata it stores
- * three kinds of objects: inodes, dentries and blocks, both data and indirect.
- */
-#include "logfs.h"
-#include <linux/slab.h>
-
-static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct btree_head32 *head = &super->s_reserved_segments;
- int err;
-
- err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
- if (err)
- return err;
- logfs_super(sb)->s_bad_segments++;
- /* FIXME: write to journal */
- return 0;
-}
-
-int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
-{
- struct logfs_super *super = logfs_super(sb);
-
- super->s_gec++;
-
- return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
- super->s_segsize, ensure_erase);
-}
-
-static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
-{
- s32 ofs;
-
- logfs_open_area(area, bytes);
-
- ofs = area->a_used_bytes;
- area->a_used_bytes += bytes;
- BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
-
- return dev_ofs(area->a_sb, area->a_segno, ofs);
-}
-
-static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
- int use_filler)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- filler_t *filler = super->s_devops->readpage;
- struct page *page;
-
- BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
- if (use_filler)
- page = read_cache_page(mapping, index, filler, sb);
- else {
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (page)
- unlock_page(page);
- }
- return page;
-}
-
-int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
- int use_filler)
-{
- pgoff_t index = ofs >> PAGE_SHIFT;
- struct page *page;
- long offset = ofs & (PAGE_SIZE-1);
- long copylen;
-
- /* Only logfs_wbuf_recover may use len==0 */
- BUG_ON(!len && !use_filler);
- do {
- copylen = min((ulong)len, PAGE_SIZE - offset);
-
- page = get_mapping_page(area->a_sb, index, use_filler);
- if (IS_ERR(page))
- return PTR_ERR(page);
- BUG_ON(!page); /* FIXME: reserve a pool */
- SetPageUptodate(page);
- memcpy(page_address(page) + offset, buf, copylen);
-
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- }
- put_page(page);
-
- buf += copylen;
- len -= copylen;
- offset = 0;
- index++;
- } while (len);
- return 0;
-}
-
-static void pad_partial_page(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- struct page *page;
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
- pgoff_t index = ofs >> PAGE_SHIFT;
- long offset = ofs & (PAGE_SIZE-1);
- u32 len = PAGE_SIZE - offset;
-
- if (len % PAGE_SIZE) {
- page = get_mapping_page(sb, index, 0);
- BUG_ON(!page); /* FIXME: reserve a pool */
- memset(page_address(page) + offset, 0xff, len);
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- }
- put_page(page);
- }
-}
-
-static void pad_full_pages(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- struct logfs_super *super = logfs_super(sb);
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
- u32 len = super->s_segsize - area->a_used_bytes;
- pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT;
- pgoff_t no_indizes = len >> PAGE_SHIFT;
- struct page *page;
-
- while (no_indizes) {
- page = get_mapping_page(sb, index, 0);
- BUG_ON(!page); /* FIXME: reserve a pool */
- SetPageUptodate(page);
- memset(page_address(page), 0xff, PAGE_SIZE);
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- }
- put_page(page);
- index++;
- no_indizes--;
- }
-}
-
-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- * Also make sure we allocate (and memset) all pages for final writeout.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
-{
- pad_partial_page(area);
- if (final)
- pad_full_pages(area);
-}
-
-/*
- * We have to be careful with the alias tree. Since lookup is done by bix,
- * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
- * indirect blocks. So always use it through accessor functions.
- */
-static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
- level_t level)
-{
- struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
- pgoff_t index = logfs_pack_index(bix, level);
-
- return btree_lookup128(head, ino, index);
-}
-
-static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
- level_t level, void *val)
-{
- struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
- pgoff_t index = logfs_pack_index(bix, level);
-
- return btree_insert128(head, ino, index, val, GFP_NOFS);
-}
-
-static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
- write_alias_t *write_one_alias)
-{
- struct object_alias_item *item;
- int err;
-
- list_for_each_entry(item, &block->item_list, list) {
- err = write_alias_journal(sb, block->ino, block->bix,
- block->level, item->child_no, item->val);
- if (err)
- return err;
- }
- return 0;
-}
-
-static const struct logfs_block_ops btree_block_ops = {
- .write_block = btree_write_block,
- .free_block = __free_block,
- .write_alias = btree_write_alias,
-};
-
-int logfs_load_object_aliases(struct super_block *sb,
- struct logfs_obj_alias *oa, int count)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_block *block;
- struct object_alias_item *item;
- u64 ino, bix;
- level_t level;
- int i, err;
-
- super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
- count /= sizeof(*oa);
- for (i = 0; i < count; i++) {
- item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
- if (!item)
- return -ENOMEM;
- memset(item, 0, sizeof(*item));
-
- super->s_no_object_aliases++;
- item->val = oa[i].val;
- item->child_no = be16_to_cpu(oa[i].child_no);
-
- ino = be64_to_cpu(oa[i].ino);
- bix = be64_to_cpu(oa[i].bix);
- level = LEVEL(oa[i].level);
-
- log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
- ino, bix, level, item->child_no,
- be64_to_cpu(item->val));
- block = alias_tree_lookup(sb, ino, bix, level);
- if (!block) {
- block = __alloc_block(sb, ino, bix, level);
- block->ops = &btree_block_ops;
- err = alias_tree_insert(sb, ino, bix, level, block);
- BUG_ON(err); /* mempool empty */
- }
- if (test_and_set_bit(item->child_no, block->alias_map)) {
- printk(KERN_ERR"LogFS: Alias collision detected\n");
- return -EIO;
- }
- list_move_tail(&block->alias_list, &super->s_object_alias);
- list_add(&item->list, &block->item_list);
- }
- return 0;
-}
-
-static void kill_alias(void *_block, unsigned long ignore0,
- u64 ignore1, u64 ignore2, size_t ignore3)
-{
- struct logfs_block *block = _block;
- struct super_block *sb = block->sb;
- struct logfs_super *super = logfs_super(sb);
- struct object_alias_item *item;
-
- while (!list_empty(&block->item_list)) {
- item = list_entry(block->item_list.next, typeof(*item), list);
- list_del(&item->list);
- mempool_free(item, super->s_alias_pool);
- }
- block->ops->free_block(sb, block);
-}
-
-static int obj_type(struct inode *inode, level_t level)
-{
- if (level == 0) {
- if (S_ISDIR(inode->i_mode))
- return OBJ_DENTRY;
- if (inode->i_ino == LOGFS_INO_MASTER)
- return OBJ_INODE;
- }
- return OBJ_BLOCK;
-}
-
-static int obj_len(struct super_block *sb, int obj_type)
-{
- switch (obj_type) {
- case OBJ_DENTRY:
- return sizeof(struct logfs_disk_dentry);
- case OBJ_INODE:
- return sizeof(struct logfs_disk_inode);
- case OBJ_BLOCK:
- return sb->s_blocksize;
- default:
- BUG();
- }
-}
-
-static int __logfs_segment_write(struct inode *inode, void *buf,
- struct logfs_shadow *shadow, int type, int len, int compr)
-{
- struct logfs_area *area;
- struct super_block *sb = inode->i_sb;
- s64 ofs;
- struct logfs_object_header h;
- int acc_len;
-
- if (shadow->gc_level == 0)
- acc_len = len;
- else
- acc_len = obj_len(sb, type);
-
- area = get_area(sb, shadow->gc_level);
- ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
- LOGFS_BUG_ON(ofs <= 0, sb);
- /*
- * Order is important. logfs_get_free_bytes(), by modifying the
- * segment file, may modify the content of the very page we're about
- * to write now. Which is fine, as long as the calculated crc and
- * written data still match. So do the modifications _before_
- * calculating the crc.
- */
-
- h.len = cpu_to_be16(len);
- h.type = type;
- h.compr = compr;
- h.ino = cpu_to_be64(inode->i_ino);
- h.bix = cpu_to_be64(shadow->bix);
- h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
- h.data_crc = logfs_crc32(buf, len, 0);
-
- logfs_buf_write(area, ofs, &h, sizeof(h));
- logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
-
- shadow->new_ofs = ofs;
- shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
-
- return 0;
-}
-
-static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
- struct logfs_shadow *shadow, int type, int len)
-{
- struct super_block *sb = inode->i_sb;
- void *compressor_buf = logfs_super(sb)->s_compressed_je;
- ssize_t compr_len;
- int ret;
-
- mutex_lock(&logfs_super(sb)->s_journal_mutex);
- compr_len = logfs_compress(buf, compressor_buf, len, len);
-
- if (compr_len >= 0) {
- ret = __logfs_segment_write(inode, compressor_buf, shadow,
- type, compr_len, COMPR_ZLIB);
- } else {
- ret = __logfs_segment_write(inode, buf, shadow, type, len,
- COMPR_NONE);
- }
- mutex_unlock(&logfs_super(sb)->s_journal_mutex);
- return ret;
-}
-
-/**
- * logfs_segment_write - write data block to object store
- * @inode: inode containing data
- *
- * Returns an errno or zero.
- */
-int logfs_segment_write(struct inode *inode, struct page *page,
- struct logfs_shadow *shadow)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_super *super = logfs_super(sb);
- int do_compress, type, len;
- int ret;
- void *buf;
-
- super->s_flags |= LOGFS_SB_FLAG_DIRTY;
- BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
- do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
- if (shadow->gc_level != 0) {
- /* temporarily disable compression for indirect blocks */
- do_compress = 0;
- }
-
- type = obj_type(inode, shrink_level(shadow->gc_level));
- len = obj_len(sb, type);
- buf = kmap(page);
- if (do_compress)
- ret = logfs_segment_write_compress(inode, buf, shadow, type,
- len);
- else
- ret = __logfs_segment_write(inode, buf, shadow, type, len,
- COMPR_NONE);
- kunmap(page);
-
- log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
- shadow->ino, shadow->bix, shadow->gc_level,
- shadow->old_ofs, shadow->new_ofs,
- shadow->old_len, shadow->new_len);
- /* this BUG_ON did catch a locking bug. useful */
- BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
- return ret;
-}
-
-int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
-{
- pgoff_t index = ofs >> PAGE_SHIFT;
- struct page *page;
- long offset = ofs & (PAGE_SIZE-1);
- long copylen;
-
- while (len) {
- copylen = min((ulong)len, PAGE_SIZE - offset);
-
- page = get_mapping_page(sb, index, 1);
- if (IS_ERR(page))
- return PTR_ERR(page);
- memcpy(buf, page_address(page) + offset, copylen);
- put_page(page);
-
- buf += copylen;
- len -= copylen;
- offset = 0;
- index++;
- }
- return 0;
-}
-
-/*
- * The "position" of indirect blocks is ambiguous. It can be the position
- * of any data block somewhere behind this indirect block. So we need to
- * normalize the positions through logfs_block_mask() before comparing.
- */
-static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
-{
- return (pos1 & logfs_block_mask(sb, level)) !=
- (pos2 & logfs_block_mask(sb, level));
-}
-
-#if 0
-static int read_seg_header(struct super_block *sb, u64 ofs,
- struct logfs_segment_header *sh)
-{
- __be32 crc;
- int err;
-
- err = wbuf_read(sb, ofs, sizeof(*sh), sh);
- if (err)
- return err;
- crc = logfs_crc32(sh, sizeof(*sh), 4);
- if (crc != sh->crc) {
- printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
- "got %x\n", ofs, be32_to_cpu(sh->crc),
- be32_to_cpu(crc));
- return -EIO;
- }
- return 0;
-}
-#endif
-
-static int read_obj_header(struct super_block *sb, u64 ofs,
- struct logfs_object_header *oh)
-{
- __be32 crc;
- int err;
-
- err = wbuf_read(sb, ofs, sizeof(*oh), oh);
- if (err)
- return err;
- crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
- if (crc != oh->crc) {
- printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
- "got %x\n", ofs, be32_to_cpu(oh->crc),
- be32_to_cpu(crc));
- return -EIO;
- }
- return 0;
-}
-
-static void move_btree_to_page(struct inode *inode, struct page *page,
- __be64 *data)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_super *super = logfs_super(sb);
- struct btree_head128 *head = &super->s_object_alias_tree;
- struct logfs_block *block;
- struct object_alias_item *item, *next;
-
- if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
- return;
-
- block = btree_remove128(head, inode->i_ino, page->index);
- if (!block)
- return;
-
- log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
- block->ino, block->bix, block->level);
- list_for_each_entry_safe(item, next, &block->item_list, list) {
- data[item->child_no] = item->val;
- list_del(&item->list);
- mempool_free(item, super->s_alias_pool);
- }
- block->page = page;
-
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long) block);
- }
- block->ops = &indirect_block_ops;
- initialize_block_counters(page, block, data, 0);
-}
-
-/*
- * This silences a false, yet annoying gcc warning. I hate it when my editor
- * jumps into bitops.h each time I recompile this file.
- * TODO: Complain to gcc folks about this and upgrade compiler.
- */
-static unsigned long fnb(const unsigned long *addr,
- unsigned long size, unsigned long offset)
-{
- return find_next_bit(addr, size, offset);
-}
-
-void move_page_to_btree(struct page *page)
-{
- struct logfs_block *block = logfs_block(page);
- struct super_block *sb = block->sb;
- struct logfs_super *super = logfs_super(sb);
- struct object_alias_item *item;
- unsigned long pos;
- __be64 *child;
- int err;
-
- if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
- block->ops->free_block(sb, block);
- return;
- }
- log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
- block->ino, block->bix, block->level);
- super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
-
- for (pos = 0; ; pos++) {
- pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
- if (pos >= LOGFS_BLOCK_FACTOR)
- break;
-
- item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
- BUG_ON(!item); /* mempool empty */
- memset(item, 0, sizeof(*item));
-
- child = kmap_atomic(page);
- item->val = child[pos];
- kunmap_atomic(child);
- item->child_no = pos;
- list_add(&item->list, &block->item_list);
- }
- block->page = NULL;
-
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- put_page(page);
- set_page_private(page, 0);
- }
- block->ops = &btree_block_ops;
- err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
- block);
- BUG_ON(err); /* mempool empty */
- ClearPageUptodate(page);
-}
-
-static int __logfs_segment_read(struct inode *inode, void *buf,
- u64 ofs, u64 bix, level_t level)
-{
- struct super_block *sb = inode->i_sb;
- void *compressor_buf = logfs_super(sb)->s_compressed_je;
- struct logfs_object_header oh;
- __be32 crc;
- u16 len;
- int err, block_len;
-
- block_len = obj_len(sb, obj_type(inode, level));
- err = read_obj_header(sb, ofs, &oh);
- if (err)
- goto out_err;
-
- err = -EIO;
- if (be64_to_cpu(oh.ino) != inode->i_ino
- || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
- printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
- "expected (%lx, %llx), got (%llx, %llx)\n",
- ofs, inode->i_ino, bix,
- be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
- goto out_err;
- }
-
- len = be16_to_cpu(oh.len);
-
- switch (oh.compr) {
- case COMPR_NONE:
- err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
- if (err)
- goto out_err;
- crc = logfs_crc32(buf, len, 0);
- if (crc != oh.data_crc) {
- printk(KERN_ERR"LOGFS: uncompressed data crc error at "
- "%llx: expected %x, got %x\n", ofs,
- be32_to_cpu(oh.data_crc),
- be32_to_cpu(crc));
- goto out_err;
- }
- break;
- case COMPR_ZLIB:
- mutex_lock(&logfs_super(sb)->s_journal_mutex);
- err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
- compressor_buf);
- if (err) {
- mutex_unlock(&logfs_super(sb)->s_journal_mutex);
- goto out_err;
- }
- crc = logfs_crc32(compressor_buf, len, 0);
- if (crc != oh.data_crc) {
- printk(KERN_ERR"LOGFS: compressed data crc error at "
- "%llx: expected %x, got %x\n", ofs,
- be32_to_cpu(oh.data_crc),
- be32_to_cpu(crc));
- mutex_unlock(&logfs_super(sb)->s_journal_mutex);
- goto out_err;
- }
- err = logfs_uncompress(compressor_buf, buf, len, block_len);
- mutex_unlock(&logfs_super(sb)->s_journal_mutex);
- if (err) {
- printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
- goto out_err;
- }
- break;
- default:
- LOGFS_BUG(sb);
- err = -EIO;
- goto out_err;
- }
- return 0;
-
-out_err:
- logfs_set_ro(sb);
- printk(KERN_ERR"LOGFS: device is read-only now\n");
- LOGFS_BUG(sb);
- return err;
-}
-
-/**
- * logfs_segment_read - read data block from object store
- * @inode: inode containing data
- * @buf: data buffer
- * @ofs: physical data offset
- * @bix: block index
- * @level: block level
- *
- * Returns 0 on success or a negative errno.
- */
-int logfs_segment_read(struct inode *inode, struct page *page,
- u64 ofs, u64 bix, level_t level)
-{
- int err;
- void *buf;
-
- if (PageUptodate(page))
- return 0;
-
- ofs &= ~LOGFS_FULLY_POPULATED;
-
- buf = kmap(page);
- err = __logfs_segment_read(inode, buf, ofs, bix, level);
- if (!err) {
- move_btree_to_page(inode, page, buf);
- SetPageUptodate(page);
- }
- kunmap(page);
- log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
- inode->i_ino, bix, level, ofs, err);
- return err;
-}
-
-int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
-{
- struct super_block *sb = inode->i_sb;
- struct logfs_super *super = logfs_super(sb);
- struct logfs_object_header h;
- u16 len;
- int err;
-
- super->s_flags |= LOGFS_SB_FLAG_DIRTY;
- BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
- BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
- if (!shadow->old_ofs)
- return 0;
-
- log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
- shadow->ino, shadow->bix, shadow->gc_level,
- shadow->old_ofs, shadow->new_ofs,
- shadow->old_len, shadow->new_len);
- err = read_obj_header(sb, shadow->old_ofs, &h);
- LOGFS_BUG_ON(err, sb);
- LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
- LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
- shrink_level(shadow->gc_level)), sb);
-
- if (shadow->gc_level == 0)
- len = be16_to_cpu(h.len);
- else
- len = obj_len(sb, h.type);
- shadow->old_len = len + sizeof(h);
- return 0;
-}
-
-void freeseg(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping = super->s_mapping_inode->i_mapping;
- struct page *page;
- u64 ofs, start, end;
-
- start = dev_ofs(sb, segno, 0);
- end = dev_ofs(sb, segno + 1, 0);
- for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
- page = find_get_page(mapping, ofs >> PAGE_SHIFT);
- if (!page)
- continue;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- put_page(page);
- }
- put_page(page);
- }
-}
-
-int logfs_open_area(struct logfs_area *area, size_t bytes)
-{
- struct super_block *sb = area->a_sb;
- struct logfs_super *super = logfs_super(sb);
- int err, closed = 0;
-
- if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
- return 0;
-
- if (area->a_is_open) {
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
- u32 len = super->s_segsize - area->a_written_bytes;
-
- log_gc("logfs_close_area(%x)\n", area->a_segno);
- pad_wbuf(area, 1);
- super->s_devops->writeseg(area->a_sb, ofs, len);
- freeseg(sb, area->a_segno);
- closed = 1;
- }
-
- area->a_used_bytes = 0;
- area->a_written_bytes = 0;
-again:
- area->a_ops->get_free_segment(area);
- area->a_ops->get_erase_count(area);
-
- log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
- err = area->a_ops->erase_segment(area);
- if (err) {
- printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
- area->a_segno);
- logfs_mark_segment_bad(sb, area->a_segno);
- goto again;
- }
- area->a_is_open = 1;
- return closed;
-}
-
-void logfs_sync_area(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- struct logfs_super *super = logfs_super(sb);
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
- u32 len = (area->a_used_bytes - area->a_written_bytes);
-
- if (super->s_writesize)
- len &= ~(super->s_writesize - 1);
- if (len == 0)
- return;
- pad_wbuf(area, 0);
- super->s_devops->writeseg(sb, ofs, len);
- area->a_written_bytes += len;
-}
-
-void logfs_sync_segments(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- for_each_area(i)
- logfs_sync_area(super->s_area[i]);
-}
-
-/*
- * Pick a free segment to be used for this area. Effectively takes a
- * candidate from the free list (not really a candidate anymore).
- */
-static void ostore_get_free_segment(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- struct logfs_super *super = logfs_super(sb);
-
- if (super->s_free_list.count == 0) {
- printk(KERN_ERR"LOGFS: ran out of free segments\n");
- LOGFS_BUG(sb);
- }
-
- area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
-}
-
-static void ostore_get_erase_count(struct logfs_area *area)
-{
- struct logfs_segment_entry se;
- u32 ec_level;
-
- logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
- BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
- se.valid == cpu_to_be32(RESERVED));
-
- ec_level = be32_to_cpu(se.ec_level);
- area->a_erase_count = (ec_level >> 4) + 1;
-}
-
-static int ostore_erase_segment(struct logfs_area *area)
-{
- struct super_block *sb = area->a_sb;
- struct logfs_segment_header sh;
- u64 ofs;
- int err;
-
- err = logfs_erase_segment(sb, area->a_segno, 0);
- if (err)
- return err;
-
- sh.pad = 0;
- sh.type = SEG_OSTORE;
- sh.level = (__force u8)area->a_level;
- sh.segno = cpu_to_be32(area->a_segno);
- sh.ec = cpu_to_be32(area->a_erase_count);
- sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
- sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
-
- logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
- area->a_level);
-
- ofs = dev_ofs(sb, area->a_segno, 0);
- area->a_used_bytes = sizeof(sh);
- logfs_buf_write(area, ofs, &sh, sizeof(sh));
- return 0;
-}
-
-static const struct logfs_area_ops ostore_area_ops = {
- .get_free_segment = ostore_get_free_segment,
- .get_erase_count = ostore_get_erase_count,
- .erase_segment = ostore_erase_segment,
-};
-
-static void free_area(struct logfs_area *area)
-{
- if (area)
- freeseg(area->a_sb, area->a_segno);
- kfree(area);
-}
-
-void free_areas(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- for_each_area(i)
- free_area(super->s_area[i]);
- free_area(super->s_journal_area);
-}
-
-static struct logfs_area *alloc_area(struct super_block *sb)
-{
- struct logfs_area *area;
-
- area = kzalloc(sizeof(*area), GFP_KERNEL);
- if (!area)
- return NULL;
-
- area->a_sb = sb;
- return area;
-}
-
-static void map_invalidatepage(struct page *page, unsigned int o,
- unsigned int l)
-{
- return;
-}
-
-static int map_releasepage(struct page *page, gfp_t g)
-{
- /* Don't release these pages */
- return 0;
-}
-
-static const struct address_space_operations mapping_aops = {
- .invalidatepage = map_invalidatepage,
- .releasepage = map_releasepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
-};
-
-int logfs_init_mapping(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct address_space *mapping;
- struct inode *inode;
-
- inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- super->s_mapping_inode = inode;
- mapping = inode->i_mapping;
- mapping->a_ops = &mapping_aops;
- /* Would it be possible to use __GFP_HIGHMEM as well? */
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- return 0;
-}
-
-int logfs_init_areas(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i = -1;
-
- super->s_alias_pool = mempool_create_kmalloc_pool(600,
- sizeof(struct object_alias_item));
- if (!super->s_alias_pool)
- return -ENOMEM;
-
- super->s_journal_area = alloc_area(sb);
- if (!super->s_journal_area)
- goto err;
-
- for_each_area(i) {
- super->s_area[i] = alloc_area(sb);
- if (!super->s_area[i])
- goto err;
- super->s_area[i]->a_level = GC_LEVEL(i);
- super->s_area[i]->a_ops = &ostore_area_ops;
- }
- btree_init_mempool128(&super->s_object_alias_tree,
- super->s_btree_pool);
- return 0;
-
-err:
- for (i--; i >= 0; i--)
- free_area(super->s_area[i]);
- free_area(super->s_journal_area);
- logfs_mempool_destroy(super->s_alias_pool);
- return -ENOMEM;
-}
-
-void logfs_cleanup_areas(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
-}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
deleted file mode 100644
index 5751082dba52..000000000000
--- a/fs/logfs/super.c
+++ /dev/null
@@ -1,653 +0,0 @@
-/*
- * fs/logfs/super.c
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- * Generally contains mount/umount code and also serves as a dump area for
- * any functions that don't fit elsewhere and neither justify a file of their
- * own.
- */
-#include "logfs.h"
-#include <linux/bio.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <linux/module.h>
-#include <linux/mtd/mtd.h>
-#include <linux/statfs.h>
-#include <linux/buffer_head.h>
-
-static DEFINE_MUTEX(emergency_mutex);
-static struct page *emergency_page;
-
-struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
-{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- struct page *page;
- int err;
-
- page = read_cache_page(mapping, index, filler, NULL);
- if (page)
- return page;
-
- /* No more pages available, switch to emergency page */
- printk(KERN_INFO"Logfs: Using emergency page\n");
- mutex_lock(&emergency_mutex);
- err = filler(NULL, emergency_page);
- if (err) {
- mutex_unlock(&emergency_mutex);
- printk(KERN_EMERG"Logfs: Error reading emergency page\n");
- return ERR_PTR(err);
- }
- return emergency_page;
-}
-
-void emergency_read_end(struct page *page)
-{
- if (page == emergency_page)
- mutex_unlock(&emergency_mutex);
- else
- put_page(page);
-}
-
-static void dump_segfile(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_segment_entry se;
- u32 segno;
-
- for (segno = 0; segno < super->s_no_segs; segno++) {
- logfs_get_segment_entry(sb, segno, &se);
- printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
- be32_to_cpu(se.valid));
- if (++segno < super->s_no_segs) {
- logfs_get_segment_entry(sb, segno, &se);
- printk(" %6x %8x", be32_to_cpu(se.ec_level),
- be32_to_cpu(se.valid));
- }
- if (++segno < super->s_no_segs) {
- logfs_get_segment_entry(sb, segno, &se);
- printk(" %6x %8x", be32_to_cpu(se.ec_level),
- be32_to_cpu(se.valid));
- }
- if (++segno < super->s_no_segs) {
- logfs_get_segment_entry(sb, segno, &se);
- printk(" %6x %8x", be32_to_cpu(se.ec_level),
- be32_to_cpu(se.valid));
- }
- printk("\n");
- }
-}
-
-/*
- * logfs_crash_dump - dump debug information to device
- *
- * The LogFS superblock only occupies part of a segment. This function will
- * write as much debug information as it can gather into the spare space.
- */
-void logfs_crash_dump(struct super_block *sb)
-{
- dump_segfile(sb);
-}
-
-/*
- * FIXME: There should be a reserve for root, similar to ext2.
- */
-int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
-{
- struct super_block *sb = dentry->d_sb;
- struct logfs_super *super = logfs_super(sb);
-
- stats->f_type = LOGFS_MAGIC_U32;
- stats->f_bsize = sb->s_blocksize;
- stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
- stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
- stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
- stats->f_files = 0;
- stats->f_ffree = 0;
- stats->f_namelen = LOGFS_MAX_NAMELEN;
- return 0;
-}
-
-static int logfs_sb_set(struct super_block *sb, void *_super)
-{
- struct logfs_super *super = _super;
-
- sb->s_fs_info = super;
- sb->s_mtd = super->s_mtd;
- sb->s_bdev = super->s_bdev;
-#ifdef CONFIG_BLOCK
- if (sb->s_bdev)
- sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
-#endif
-#ifdef CONFIG_MTD
- if (sb->s_mtd)
- sb->s_bdi = sb->s_mtd->backing_dev_info;
-#endif
- return 0;
-}
-
-static int logfs_sb_test(struct super_block *sb, void *_super)
-{
- struct logfs_super *super = _super;
- struct mtd_info *mtd = super->s_mtd;
-
- if (mtd && sb->s_mtd == mtd)
- return 1;
- if (super->s_bdev && sb->s_bdev == super->s_bdev)
- return 1;
- return 0;
-}
-
-static void set_segment_header(struct logfs_segment_header *sh, u8 type,
- u8 level, u32 segno, u32 ec)
-{
- sh->pad = 0;
- sh->type = type;
- sh->level = level;
- sh->segno = cpu_to_be32(segno);
- sh->ec = cpu_to_be32(ec);
- sh->gec = cpu_to_be64(segno);
- sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
-}
-
-static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
- u32 segno, u32 ec)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_segment_header *sh = &ds->ds_sh;
- int i;
-
- memset(ds, 0, sizeof(*ds));
- set_segment_header(sh, SEG_SUPER, 0, segno, ec);
-
- ds->ds_ifile_levels = super->s_ifile_levels;
- ds->ds_iblock_levels = super->s_iblock_levels;
- ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
- ds->ds_segment_shift = super->s_segshift;
- ds->ds_block_shift = sb->s_blocksize_bits;
- ds->ds_write_shift = super->s_writeshift;
- ds->ds_filesystem_size = cpu_to_be64(super->s_size);
- ds->ds_segment_size = cpu_to_be32(super->s_segsize);
- ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
- ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
- ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
- ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
- ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
- ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
- ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
- journal_for_each(i)
- ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
- ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
- ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
- LOGFS_SEGMENT_HEADERSIZE + 12);
-}
-
-static int write_one_sb(struct super_block *sb,
- struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_disk_super *ds;
- struct logfs_segment_entry se;
- struct page *page;
- u64 ofs;
- u32 ec, segno;
- int err;
-
- page = find_sb(sb, &ofs);
- if (!page)
- return -EIO;
- ds = page_address(page);
- segno = seg_no(sb, ofs);
- logfs_get_segment_entry(sb, segno, &se);
- ec = be32_to_cpu(se.ec_level) >> 4;
- ec++;
- logfs_set_segment_erased(sb, segno, ec, 0);
- logfs_write_ds(sb, ds, segno, ec);
- err = super->s_devops->write_sb(sb, page);
- put_page(page);
- return err;
-}
-
-int logfs_write_sb(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int err;
-
- /* First superblock */
- err = write_one_sb(sb, super->s_devops->find_first_sb);
- if (err)
- return err;
-
- /* Last superblock */
- err = write_one_sb(sb, super->s_devops->find_last_sb);
- if (err)
- return err;
- return 0;
-}
-
-static int ds_cmp(const void *ds0, const void *ds1)
-{
- size_t len = sizeof(struct logfs_disk_super);
-
- /* We know the segment headers differ, so ignore them */
- len -= LOGFS_SEGMENT_HEADERSIZE;
- ds0 += LOGFS_SEGMENT_HEADERSIZE;
- ds1 += LOGFS_SEGMENT_HEADERSIZE;
- return memcmp(ds0, ds1, len);
-}
-
-static int logfs_recover_sb(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_disk_super _ds0, *ds0 = &_ds0;
- struct logfs_disk_super _ds1, *ds1 = &_ds1;
- int err, valid0, valid1;
-
- /* read first superblock */
- err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
- if (err)
- return err;
- /* read last superblock */
- err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
- if (err)
- return err;
- valid0 = logfs_check_ds(ds0) == 0;
- valid1 = logfs_check_ds(ds1) == 0;
-
- if (!valid0 && valid1) {
- printk(KERN_INFO"First superblock is invalid - fixing.\n");
- return write_one_sb(sb, super->s_devops->find_first_sb);
- }
- if (valid0 && !valid1) {
- printk(KERN_INFO"Last superblock is invalid - fixing.\n");
- return write_one_sb(sb, super->s_devops->find_last_sb);
- }
- if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
- printk(KERN_INFO"Superblocks don't match - fixing.\n");
- return logfs_write_sb(sb);
- }
- /* If neither is valid now, something's wrong. Didn't we properly
- * check them before?!? */
- BUG_ON(!valid0 && !valid1);
- return 0;
-}
-
-static int logfs_make_writeable(struct super_block *sb)
-{
- int err;
-
- err = logfs_open_segfile(sb);
- if (err)
- return err;
-
- /* Repair any broken superblock copies */
- err = logfs_recover_sb(sb);
- if (err)
- return err;
-
- /* Check areas for trailing unaccounted data */
- err = logfs_check_areas(sb);
- if (err)
- return err;
-
- /* Do one GC pass before any data gets dirtied */
- logfs_gc_pass(sb);
-
- /* after all initializations are done, replay the journal
- * for rw-mounts, if necessary */
- err = logfs_replay_journal(sb);
- if (err)
- return err;
-
- return 0;
-}
-
-static int logfs_get_sb_final(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct inode *rootdir;
- int err;
-
- /* root dir */
- rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
- if (IS_ERR(rootdir))
- goto fail;
-
- sb->s_root = d_make_root(rootdir);
- if (!sb->s_root)
- goto fail;
-
- /* at that point we know that ->put_super() will be called */
- super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
- if (!super->s_erase_page)
- return -ENOMEM;
- memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
-
- /* FIXME: check for read-only mounts */
- err = logfs_make_writeable(sb);
- if (err) {
- __free_page(super->s_erase_page);
- return err;
- }
-
- log_super("LogFS: Finished mounting\n");
- return 0;
-
-fail:
- iput(super->s_master_inode);
- iput(super->s_segfile_inode);
- iput(super->s_mapping_inode);
- return -EIO;
-}
-
-int logfs_check_ds(struct logfs_disk_super *ds)
-{
- struct logfs_segment_header *sh = &ds->ds_sh;
-
- if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
- return -EINVAL;
- if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
- return -EINVAL;
- if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
- LOGFS_SEGMENT_HEADERSIZE + 12))
- return -EINVAL;
- return 0;
-}
-
-static struct page *find_super_block(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct page *first, *last;
-
- first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
- if (!first || IS_ERR(first))
- return NULL;
- last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
- if (!last || IS_ERR(last)) {
- put_page(first);
- return NULL;
- }
-
- if (!logfs_check_ds(page_address(first))) {
- put_page(last);
- return first;
- }
-
- /* First one didn't work, try the second superblock */
- if (!logfs_check_ds(page_address(last))) {
- put_page(first);
- return last;
- }
-
- /* Neither worked, sorry folks */
- put_page(first);
- put_page(last);
- return NULL;
-}
-
-static int __logfs_read_sb(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct page *page;
- struct logfs_disk_super *ds;
- int i;
-
- page = find_super_block(sb);
- if (!page)
- return -EINVAL;
-
- ds = page_address(page);
- super->s_size = be64_to_cpu(ds->ds_filesystem_size);
- super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
- super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
- super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
- super->s_segsize = 1 << ds->ds_segment_shift;
- super->s_segmask = (1 << ds->ds_segment_shift) - 1;
- super->s_segshift = ds->ds_segment_shift;
- sb->s_blocksize = 1 << ds->ds_block_shift;
- sb->s_blocksize_bits = ds->ds_block_shift;
- super->s_writesize = 1 << ds->ds_write_shift;
- super->s_writeshift = ds->ds_write_shift;
- super->s_no_segs = super->s_size >> super->s_segshift;
- super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
- super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
- super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
- super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
- super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
-
- journal_for_each(i)
- super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
-
- super->s_ifile_levels = ds->ds_ifile_levels;
- super->s_iblock_levels = ds->ds_iblock_levels;
- super->s_data_levels = ds->ds_data_levels;
- super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
- + super->s_data_levels;
- put_page(page);
- return 0;
-}
-
-static int logfs_read_sb(struct super_block *sb, int read_only)
-{
- struct logfs_super *super = logfs_super(sb);
- int ret;
-
- super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
- if (!super->s_btree_pool)
- return -ENOMEM;
-
- btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
- btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
- btree_init_mempool32(&super->s_shadow_tree.segment_map,
- super->s_btree_pool);
-
- ret = logfs_init_mapping(sb);
- if (ret)
- return ret;
-
- ret = __logfs_read_sb(sb);
- if (ret)
- return ret;
-
- if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
- return -EIO;
- if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
- !read_only)
- return -EIO;
-
- ret = logfs_init_rw(sb);
- if (ret)
- return ret;
-
- ret = logfs_init_areas(sb);
- if (ret)
- return ret;
-
- ret = logfs_init_gc(sb);
- if (ret)
- return ret;
-
- ret = logfs_init_journal(sb);
- if (ret)
- return ret;
-
- return 0;
-}
-
-static void logfs_kill_sb(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- log_super("LogFS: Start unmounting\n");
- /* Alias entries slow down mount, so evict as many as possible */
- sync_filesystem(sb);
- logfs_write_anchor(sb);
- free_areas(sb);
-
- /*
- * From this point on alias entries are simply dropped - and any
- * writes to the object store are considered bugs.
- */
- log_super("LogFS: Now in shutdown\n");
- generic_shutdown_super(sb);
- super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
-
- BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
-
- logfs_cleanup_gc(sb);
- logfs_cleanup_journal(sb);
- logfs_cleanup_areas(sb);
- logfs_cleanup_rw(sb);
- if (super->s_erase_page)
- __free_page(super->s_erase_page);
- super->s_devops->put_device(super);
- logfs_mempool_destroy(super->s_btree_pool);
- logfs_mempool_destroy(super->s_alias_pool);
- kfree(super);
- log_super("LogFS: Finished unmounting\n");
-}
-
-static struct dentry *logfs_get_sb_device(struct logfs_super *super,
- struct file_system_type *type, int flags)
-{
- struct super_block *sb;
- int err = -ENOMEM;
- static int mount_count;
-
- log_super("LogFS: Start mount %x\n", mount_count++);
-
- err = -EINVAL;
- sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super);
- if (IS_ERR(sb)) {
- super->s_devops->put_device(super);
- kfree(super);
- return ERR_CAST(sb);
- }
-
- if (sb->s_root) {
- /* Device is already in use */
- super->s_devops->put_device(super);
- kfree(super);
- return dget(sb->s_root);
- }
-
- /*
- * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
- * only covers 16TB and the upper 8TB are used for indirect blocks.
- * On 64bit system we could bump up the limit, but that would make
- * the filesystem incompatible with 32bit systems.
- */
- sb->s_maxbytes = (1ull << 43) - 1;
- sb->s_max_links = LOGFS_LINK_MAX;
- sb->s_op = &logfs_super_operations;
-
- err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
- if (err)
- goto err1;
-
- sb->s_flags |= MS_ACTIVE;
- err = logfs_get_sb_final(sb);
- if (err) {
- deactivate_locked_super(sb);
- return ERR_PTR(err);
- }
- return dget(sb->s_root);
-
-err1:
- /* no ->s_root, no ->put_super() */
- iput(super->s_master_inode);
- iput(super->s_segfile_inode);
- iput(super->s_mapping_inode);
- deactivate_locked_super(sb);
- return ERR_PTR(err);
-}
-
-static struct dentry *logfs_mount(struct file_system_type *type, int flags,
- const char *devname, void *data)
-{
- ulong mtdnr;
- struct logfs_super *super;
- int err;
-
- super = kzalloc(sizeof(*super), GFP_KERNEL);
- if (!super)
- return ERR_PTR(-ENOMEM);
-
- mutex_init(&super->s_dirop_mutex);
- mutex_init(&super->s_object_alias_mutex);
- INIT_LIST_HEAD(&super->s_freeing_list);
-
- if (!devname)
- err = logfs_get_sb_bdev(super, type, devname);
- else if (strncmp(devname, "mtd", 3))
- err = logfs_get_sb_bdev(super, type, devname);
- else {
- char *garbage;
- mtdnr = simple_strtoul(devname+3, &garbage, 0);
- if (*garbage)
- err = -EINVAL;
- else
- err = logfs_get_sb_mtd(super, mtdnr);
- }
-
- if (err) {
- kfree(super);
- return ERR_PTR(err);
- }
-
- return logfs_get_sb_device(super, type, flags);
-}
-
-static struct file_system_type logfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "logfs",
- .mount = logfs_mount,
- .kill_sb = logfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
-
-};
-MODULE_ALIAS_FS("logfs");
-
-static int __init logfs_init(void)
-{
- int ret;
-
- emergency_page = alloc_pages(GFP_KERNEL, 0);
- if (!emergency_page)
- return -ENOMEM;
-
- ret = logfs_compr_init();
- if (ret)
- goto out1;
-
- ret = logfs_init_inode_cache();
- if (ret)
- goto out2;
-
- ret = register_filesystem(&logfs_fs_type);
- if (!ret)
- return 0;
- logfs_destroy_inode_cache();
-out2:
- logfs_compr_exit();
-out1:
- __free_pages(emergency_page, 0);
- return ret;
-}
-
-static void __exit logfs_exit(void)
-{
- unregister_filesystem(&logfs_fs_type);
- logfs_destroy_inode_cache();
- logfs_compr_exit();
- __free_pages(emergency_page, 0);
-}
-
-module_init(logfs_init);
-module_exit(logfs_exit);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
-MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/mbcache.c b/fs/mbcache.c
index c5bd19ffa326..b19be429d655 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -29,7 +29,7 @@ struct mb_cache {
/* log2 of hash table size */
int c_bucket_bits;
/* Maximum entries in cache to avoid degrading hash too much */
- int c_max_entries;
+ unsigned long c_max_entries;
/* Protects c_list, c_entry_count */
spinlock_t c_list_lock;
struct list_head c_list;
@@ -43,7 +43,7 @@ struct mb_cache {
static struct kmem_cache *mb_entry_cache;
static unsigned long mb_cache_shrink(struct mb_cache *cache,
- unsigned int nr_to_scan);
+ unsigned long nr_to_scan);
static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
u32 key)
@@ -155,12 +155,12 @@ out:
}
/*
- * mb_cache_entry_find_first - find the first entry in cache with given key
+ * mb_cache_entry_find_first - find the first reusable entry with the given key
* @cache: cache where we should search
* @key: key to look for
*
- * Search in @cache for entry with key @key. Grabs reference to the first
- * entry found and returns the entry.
+ * Search in @cache for a reusable entry with key @key. Grabs reference to the
+ * first reusable entry found and returns the entry.
*/
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
u32 key)
@@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
EXPORT_SYMBOL(mb_cache_entry_find_first);
/*
- * mb_cache_entry_find_next - find next entry in cache with the same
+ * mb_cache_entry_find_next - find next reusable entry with the same key
* @cache: cache where we should search
* @entry: entry to start search from
*
- * Finds next entry in the hash chain which has the same key as @entry.
- * If @entry is unhashed (which can happen when deletion of entry races
- * with the search), finds the first entry in the hash chain. The function
- * drops reference to @entry and returns with a reference to the found entry.
+ * Finds next reusable entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races with the
+ * search), finds the first reusable entry in the hash chain. The function drops
+ * reference to @entry and returns with a reference to the found entry.
*/
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
struct mb_cache_entry *entry)
@@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
/* Shrink number of entries in cache */
static unsigned long mb_cache_shrink(struct mb_cache *cache,
- unsigned int nr_to_scan)
+ unsigned long nr_to_scan)
{
struct mb_cache_entry *entry;
struct hlist_bl_head *head;
- unsigned int shrunk = 0;
+ unsigned long shrunk = 0;
spin_lock(&cache->c_list_lock);
while (nr_to_scan-- && !list_empty(&cache->c_list)) {
@@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
struct mb_cache_entry, e_list);
if (entry->e_referenced) {
entry->e_referenced = 0;
- list_move_tail(&cache->c_list, &entry->e_list);
+ list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
list_del_init(&entry->e_list);
@@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
static unsigned long mb_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- int nr_to_scan = sc->nr_to_scan;
struct mb_cache *cache = container_of(shrink, struct mb_cache,
c_shrink);
- return mb_cache_shrink(cache, nr_to_scan);
+ return mb_cache_shrink(cache, sc->nr_to_scan);
}
/* We shrink 1/X of the cache when we have too many entries in it */
@@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work)
struct mb_cache *mb_cache_create(int bucket_bits)
{
struct mb_cache *cache;
- int bucket_count = 1 << bucket_bits;
- int i;
-
- if (!try_module_get(THIS_MODULE))
- return NULL;
+ unsigned long bucket_count = 1UL << bucket_bits;
+ unsigned long i;
cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
if (!cache)
@@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits)
return cache;
err_out:
- module_put(THIS_MODULE);
return NULL;
}
EXPORT_SYMBOL(mb_cache_create);
@@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache)
}
kfree(cache->c_hash);
kfree(cache);
- module_put(THIS_MODULE);
}
EXPORT_SYMBOL(mb_cache_destroy);
@@ -420,7 +414,8 @@ static int __init mbcache_init(void)
mb_entry_cache = kmem_cache_create("mbcache",
sizeof(struct mb_cache_entry), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- BUG_ON(!mb_entry_cache);
+ if (!mb_entry_cache)
+ return -ENOMEM;
return 0;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f975d667c539..e7d9bf86d975 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -434,7 +434,6 @@ static const struct address_space_operations minix_aops = {
};
static const struct inode_operations minix_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.getattr = minix_getattr,
};
diff --git a/fs/mount.h b/fs/mount.h
index d2e25d7b64b3..2c856fc47ae3 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -94,6 +94,12 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);
+static inline bool __path_is_mountpoint(const struct path *path)
+{
+ struct mount *m = __lookup_mnt(path->mnt, path->dentry);
+ return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
+}
+
extern void __detach_mounts(struct dentry *dentry);
static inline void detach_mounts(struct dentry *dentry)
diff --git a/fs/mpage.c b/fs/mpage.c
index d2413af0823a..28af984a3d96 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
+ clean_bdev_bh_alias(&map_bh);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
@@ -705,7 +704,7 @@ mpage_writepages(struct address_space *mapping,
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0);
+ REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
}
@@ -726,7 +725,7 @@ int mpage_writepage(struct page *page, get_block_t get_block,
int ret = __mpage_writepage(page, wbc, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0);
+ REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
return ret;
diff --git a/fs/namei.c b/fs/namei.c
index 5b4eed221530..ad74877e1442 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -37,7 +37,7 @@
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
#include "mount.h"
@@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
- ret = path->dentry->d_op->d_manage(path->dentry, false);
+ ret = path->dentry->d_op->d_manage(path, false);
if (ret < 0)
break;
}
@@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path)
}
EXPORT_SYMBOL(follow_down_one);
-static inline int managed_dentry_rcu(struct dentry *dentry)
+static inline int managed_dentry_rcu(const struct path *path)
{
- return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
- dentry->d_op->d_manage(dentry, true) : 0;
+ return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+ path->dentry->d_op->d_manage(path, true) : 0;
}
/*
@@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
*/
- switch (managed_dentry_rcu(path->dentry)) {
+ switch (managed_dentry_rcu(path)) {
case -ECHILD:
default:
return false;
@@ -1392,8 +1392,7 @@ int follow_down(struct path *path)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
- ret = path->dentry->d_op->d_manage(
- path->dentry, false);
+ ret = path->dentry->d_op->d_manage(path, false);
if (ret < 0)
return ret == -EISDIR ? 0 : ret;
}
@@ -1725,30 +1724,35 @@ static int pick_link(struct nameidata *nd, struct path *link,
return 1;
}
+enum {WALK_FOLLOW = 1, WALK_MORE = 2};
+
/*
* Do we need to follow links? We _really_ want to be able
* to do this check without having to look at inode->i_op,
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
-static inline int should_follow_link(struct nameidata *nd, struct path *link,
- int follow,
- struct inode *inode, unsigned seq)
+static inline int step_into(struct nameidata *nd, struct path *path,
+ int flags, struct inode *inode, unsigned seq)
{
- if (likely(!d_is_symlink(link->dentry)))
- return 0;
- if (!follow)
+ if (!(flags & WALK_MORE) && nd->depth)
+ put_link(nd);
+ if (likely(!d_is_symlink(path->dentry)) ||
+ !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
+ /* not a symlink or should not follow */
+ path_to_nameidata(path, nd);
+ nd->inode = inode;
+ nd->seq = seq;
return 0;
+ }
/* make sure that d_is_symlink above matches inode */
if (nd->flags & LOOKUP_RCU) {
- if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ if (read_seqcount_retry(&path->dentry->d_seq, seq))
return -ECHILD;
}
- return pick_link(nd, link, inode, seq);
+ return pick_link(nd, path, inode, seq);
}
-enum {WALK_GET = 1, WALK_PUT = 2};
-
static int walk_component(struct nameidata *nd, int flags)
{
struct path path;
@@ -1762,7 +1766,7 @@ static int walk_component(struct nameidata *nd, int flags)
*/
if (unlikely(nd->last_type != LAST_NORM)) {
err = handle_dots(nd, nd->last_type);
- if (flags & WALK_PUT)
+ if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return err;
}
@@ -1789,15 +1793,7 @@ static int walk_component(struct nameidata *nd, int flags)
inode = d_backing_inode(path.dentry);
}
- if (flags & WALK_PUT)
- put_link(nd);
- err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
- if (unlikely(err))
- return err;
- path_to_nameidata(&path, nd);
- nd->inode = inode;
- nd->seq = seq;
- return 0;
+ return step_into(nd, &path, flags, inode, seq);
}
/*
@@ -2104,9 +2100,10 @@ OK:
if (!name)
return 0;
/* last component of nested symlink */
- err = walk_component(nd, WALK_GET | WALK_PUT);
+ err = walk_component(nd, WALK_FOLLOW);
} else {
- err = walk_component(nd, WALK_GET);
+ /* not the last component */
+ err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
if (err < 0)
return err;
@@ -2248,12 +2245,7 @@ static inline int lookup_last(struct nameidata *nd)
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
nd->flags &= ~LOOKUP_PARENT;
- return walk_component(nd,
- nd->flags & LOOKUP_FOLLOW
- ? nd->depth
- ? WALK_PUT | WALK_GET
- : WALK_GET
- : 0);
+ return walk_component(nd, 0);
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -2558,28 +2550,9 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
}
EXPORT_SYMBOL(user_path_at_empty);
-/*
- * NB: most callers don't do anything directly with the reference to the
- * to struct filename, but the nd->last pointer points into the name string
- * allocated by getname. So we must hold the reference to it until all
- * path-walking is complete.
- */
-static inline struct filename *
-user_path_parent(int dfd, const char __user *path,
- struct path *parent,
- struct qstr *last,
- int *type,
- unsigned int flags)
-{
- /* only LOOKUP_REVAL is allowed in extra flags */
- return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
- parent, last, type);
-}
-
/**
* mountpoint_last - look up last component for umount
* @nd: pathwalk nameidata - currently pointing at parent directory of "last"
- * @path: pointer to container for result
*
* This is a special lookup_last function just for umount. In this case, we
* need to resolve the path without doing any revalidation.
@@ -2592,23 +2565,20 @@ user_path_parent(int dfd, const char __user *path,
*
* Returns:
* -error: if there was an error during lookup. This includes -ENOENT if the
- * lookup found a negative dentry. The nd->path reference will also be
- * put in this case.
+ * lookup found a negative dentry.
*
- * 0: if we successfully resolved nd->path and found it to not to be a
- * symlink that needs to be followed. "path" will also be populated.
- * The nd->path reference will also be put.
+ * 0: if we successfully resolved nd->last and found it not to be a
+ * symlink that needs to be followed.
*
* 1: if we successfully resolved nd->last and found it to be a symlink
- * that needs to be followed. "path" will be populated with the path
- * to the link, and nd->path will *not* be put.
+ * that needs to be followed.
*/
static int
-mountpoint_last(struct nameidata *nd, struct path *path)
+mountpoint_last(struct nameidata *nd)
{
int error = 0;
- struct dentry *dentry;
struct dentry *dir = nd->path.dentry;
+ struct path path;
/* If we're in rcuwalk, drop out of it to handle last component */
if (nd->flags & LOOKUP_RCU) {
@@ -2622,37 +2592,28 @@ mountpoint_last(struct nameidata *nd, struct path *path)
error = handle_dots(nd, nd->last_type);
if (error)
return error;
- dentry = dget(nd->path.dentry);
+ path.dentry = dget(nd->path.dentry);
} else {
- dentry = d_lookup(dir, &nd->last);
- if (!dentry) {
+ path.dentry = d_lookup(dir, &nd->last);
+ if (!path.dentry) {
/*
* No cached dentry. Mounted dentries are pinned in the
* cache, so that means that this dentry is probably
* a symlink or the path doesn't actually point
* to a mounted dentry.
*/
- dentry = lookup_slow(&nd->last, dir,
+ path.dentry = lookup_slow(&nd->last, dir,
nd->flags | LOOKUP_NO_REVAL);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ if (IS_ERR(path.dentry))
+ return PTR_ERR(path.dentry);
}
}
- if (d_is_negative(dentry)) {
- dput(dentry);
+ if (d_is_negative(path.dentry)) {
+ dput(path.dentry);
return -ENOENT;
}
- if (nd->depth)
- put_link(nd);
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
- d_backing_inode(dentry), 0);
- if (unlikely(error))
- return error;
- mntget(path->mnt);
- follow_mount(path);
- return 0;
+ path.mnt = nd->path.mnt;
+ return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
}
/**
@@ -2672,13 +2633,19 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
if (IS_ERR(s))
return PTR_ERR(s);
while (!(err = link_path_walk(s, nd)) &&
- (err = mountpoint_last(nd, path)) > 0) {
+ (err = mountpoint_last(nd)) > 0) {
s = trailing_symlink(nd);
if (IS_ERR(s)) {
err = PTR_ERR(s);
break;
}
}
+ if (!err) {
+ *path = nd->path;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
+ follow_mount(path);
+ }
terminate_walk(nd);
return err;
}
@@ -2895,7 +2862,7 @@ bool may_open_dev(const struct path *path)
!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
-static int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(const struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
struct inode *inode = dentry->d_inode;
@@ -2945,7 +2912,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
static int handle_truncate(struct file *filp)
{
- struct path *path = &filp->f_path;
+ const struct path *path = &filp->f_path;
struct inode *inode = path->dentry->d_inode;
int error = get_write_access(inode);
if (error)
@@ -3335,18 +3302,11 @@ static int do_last(struct nameidata *nd,
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
- if (nd->depth)
- put_link(nd);
- error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
- inode, seq);
+ error = step_into(nd, &path, 0, inode, seq);
if (unlikely(error))
return error;
-
- path_to_nameidata(&path, nd);
- nd->inode = inode;
- nd->seq = seq;
- /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
+ /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
if (error)
return error;
@@ -3861,8 +3821,8 @@ static long do_rmdir(int dfd, const char __user *pathname)
int type;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname,
- &path, &last, &type, lookup_flags);
+ name = filename_parentat(dfd, getname(pathname), lookup_flags,
+ &path, &last, &type);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -3991,8 +3951,8 @@ static long do_unlinkat(int dfd, const char __user *pathname)
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname,
- &path, &last, &type, lookup_flags);
+ name = filename_parentat(dfd, getname(pathname), lookup_flags,
+ &path, &last, &type);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -4345,11 +4305,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- /*
- * Check source == target.
- * On overlayfs need to look at underlying inodes.
- */
- if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
+ if (source == target)
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
@@ -4491,15 +4447,15 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
target_flags = 0;
retry:
- from = user_path_parent(olddfd, oldname,
- &old_path, &old_last, &old_type, lookup_flags);
+ from = filename_parentat(olddfd, getname(oldname), lookup_flags,
+ &old_path, &old_last, &old_type);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
}
- to = user_path_parent(newdfd, newname,
- &new_path, &new_last, &new_type, lookup_flags);
+ to = filename_parentat(newdfd, getname(newname), lookup_flags,
+ &new_path, &new_last, &new_type);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
@@ -4650,7 +4606,8 @@ out:
* have ->get_link() not calling nd_jump_link(). Using (or not using) it
* for any given inode is up to filesystem.
*/
-int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+static int generic_readlink(struct dentry *dentry, char __user *buffer,
+ int buflen)
{
DEFINE_DELAYED_CALL(done);
struct inode *inode = d_inode(dentry);
@@ -4666,7 +4623,36 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
do_delayed_call(&done);
return res;
}
-EXPORT_SYMBOL(generic_readlink);
+
+/**
+ * vfs_readlink - copy symlink body into userspace buffer
+ * @dentry: dentry on which to get symbolic link
+ * @buffer: user memory pointer
+ * @buflen: size of buffer
+ *
+ * Does not touch atime. That's up to the caller if necessary.
+ *
+ * Does not call security hook.
+ */
+int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
+ if (unlikely(inode->i_op->readlink))
+ return inode->i_op->readlink(dentry, buffer, buflen);
+
+ if (!d_is_symlink(dentry))
+ return -EINVAL;
+
+ spin_lock(&inode->i_lock);
+ inode->i_opflags |= IOP_DEFAULT_READLINK;
+ spin_unlock(&inode->i_lock);
+ }
+
+ return generic_readlink(dentry, buffer, buflen);
+}
+EXPORT_SYMBOL(vfs_readlink);
/**
* vfs_get_link - get symlink body
@@ -4783,7 +4769,6 @@ int page_symlink(struct inode *inode, const char *symname, int len)
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
diff --git a/fs/namespace.c b/fs/namespace.c
index e6c234b1a645..487ba30bb5c6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -96,10 +96,6 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
return &mountpoint_hashtable[tmp & mp_hash_mask];
}
-/*
- * allocation is serialized by namespace_sem, but we need the spinlock to
- * serialize with freeing.
- */
static int mnt_alloc_id(struct mount *mnt)
{
int res;
@@ -678,7 +674,7 @@ out:
*
* lookup_mnt takes a reference to the found vfsmount.
*/
-struct vfsmount *lookup_mnt(struct path *path)
+struct vfsmount *lookup_mnt(const struct path *path)
{
struct mount *child_mnt;
struct vfsmount *m;
@@ -746,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
return NULL;
}
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
- struct hlist_head *chain = mp_hash(dentry);
- struct mountpoint *mp;
+ struct mountpoint *mp, *new = NULL;
int ret;
- mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
- if (!mp)
+ if (d_mountpoint(dentry)) {
+mountpoint:
+ read_seqlock_excl(&mount_lock);
+ mp = lookup_mountpoint(dentry);
+ read_sequnlock_excl(&mount_lock);
+ if (mp)
+ goto done;
+ }
+
+ if (!new)
+ new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+ if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* Exactly one process may set d_mounted */
ret = d_set_mounted(dentry);
- if (ret) {
- kfree(mp);
- return ERR_PTR(ret);
- }
- mp->m_dentry = dentry;
- mp->m_count = 1;
- hlist_add_head(&mp->m_hash, chain);
- INIT_HLIST_HEAD(&mp->m_list);
+ /* Someone else set d_mounted? */
+ if (ret == -EBUSY)
+ goto mountpoint;
+
+ /* The dentry is not available as a mountpoint? */
+ mp = ERR_PTR(ret);
+ if (ret)
+ goto done;
+
+ /* Add the new mountpoint to the hash table */
+ read_seqlock_excl(&mount_lock);
+ new->m_dentry = dentry;
+ new->m_count = 1;
+ hlist_add_head(&new->m_hash, mp_hash(dentry));
+ INIT_HLIST_HEAD(&new->m_list);
+ read_sequnlock_excl(&mount_lock);
+
+ mp = new;
+ new = NULL;
+done:
+ kfree(new);
return mp;
}
@@ -1034,6 +1054,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (IS_MNT_SLAVE(old))
list_add(&mnt->mnt_slave, &old->mnt_slave);
mnt->mnt_master = old->mnt_master;
+ } else {
+ CLEAR_MNT_SHARED(mnt);
}
if (flag & CL_MAKE_SHARED)
set_mnt_shared(mnt);
@@ -1159,7 +1181,36 @@ struct vfsmount *mntget(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntget);
-struct vfsmount *mnt_clone_internal(struct path *path)
+/* path_is_mountpoint() - Check if path is a mount in the current
+ * namespace.
+ *
+ * d_mountpoint() can only be used reliably to establish if a dentry is
+ * not mounted in any namespace and that common case is handled inline.
+ * d_mountpoint() isn't aware of the possibility there may be multiple
+ * mounts using a given dentry in a different namespace. This function
+ * checks if the passed in path is a mountpoint rather than the dentry
+ * alone.
+ */
+bool path_is_mountpoint(const struct path *path)
+{
+ unsigned seq;
+ bool res;
+
+ if (!d_mountpoint(path->dentry))
+ return false;
+
+ rcu_read_lock();
+ do {
+ seq = read_seqbegin(&mount_lock);
+ res = __path_is_mountpoint(path);
+ } while (read_seqretry(&mount_lock, seq));
+ rcu_read_unlock();
+
+ return res;
+}
+EXPORT_SYMBOL(path_is_mountpoint);
+
+struct vfsmount *mnt_clone_internal(const struct path *path)
{
struct mount *p;
p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
@@ -1568,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry)
struct mount *mnt;
namespace_lock();
+ lock_mount_hash();
mp = lookup_mountpoint(dentry);
if (IS_ERR_OR_NULL(mp))
goto out_unlock;
- lock_mount_hash();
event++;
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1582,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry)
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
- unlock_mount_hash();
put_mountpoint(mp);
out_unlock:
+ unlock_mount_hash();
namespace_unlock();
}
@@ -1758,7 +1809,7 @@ out:
/* Caller should check returned pointer for errors */
-struct vfsmount *collect_mounts(struct path *path)
+struct vfsmount *collect_mounts(const struct path *path)
{
struct mount *tree;
namespace_lock();
@@ -1791,7 +1842,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
*
* Release with mntput().
*/
-struct vfsmount *clone_private_mount(struct path *path)
+struct vfsmount *clone_private_mount(const struct path *path)
{
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
@@ -1799,9 +1850,7 @@ struct vfsmount *clone_private_mount(struct path *path)
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
- down_read(&namespace_sem);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
- up_read(&namespace_sem);
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
@@ -2013,9 +2062,7 @@ retry:
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) {
- struct mountpoint *mp = lookup_mountpoint(dentry);
- if (!mp)
- mp = new_mountpoint(dentry);
+ struct mountpoint *mp = get_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
@@ -2034,7 +2081,11 @@ retry:
static void unlock_mount(struct mountpoint *where)
{
struct dentry *dentry = where->m_dentry;
+
+ read_seqlock_excl(&mount_lock);
put_mountpoint(where);
+ read_sequnlock_excl(&mount_lock);
+
namespace_unlock();
inode_unlock(dentry->d_inode);
}
@@ -2997,7 +3048,7 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
-bool path_is_under(struct path *path1, struct path *path2)
+bool path_is_under(const struct path *path1, const struct path *path2)
{
bool res;
read_seqlock_excl(&mount_lock);
@@ -3110,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
+ put_mountpoint(root_mp);
unlock_mount_hash();
chroot_fs_refs(&root, &new);
- put_mountpoint(root_mp);
error = 0;
out4:
unlock_mount(old_mp);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 6df2a3827574..088f52484d6e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -18,7 +18,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/namei.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/byteorder.h>
#include "ncp_fs.h"
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index dd38ca1f2ecb..76965e772264 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -8,7 +8,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/time.h>
#include <linux/kernel.h>
@@ -203,7 +203,7 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
bufsize - (pos % bufsize),
iov_iter_count(from));
- if (copy_from_iter(bouncebuffer, to_write, from) != to_write) {
+ if (!copy_from_iter_full(bouncebuffer, to_write, from)) {
errno = -EFAULT;
break;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index f6cf4c7e92b1..7eb89c23c847 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -13,7 +13,7 @@
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/byteorder.h>
#include <linux/time.h>
@@ -243,7 +243,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
static const struct inode_operations ncp_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = ncp_notify_change,
};
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0a3f9b594602..4434e4977cf3 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,7 +20,7 @@
#include <linux/vmalloc.h>
#include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "ncp_fs.h"
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 33b873b259a8..39f57bef8531 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -18,7 +18,7 @@
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "ncp_fs.h"
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 17cfb743b5bf..b4c87cfcee95 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -21,7 +21,7 @@
#include <linux/fcntl.h>
#include <linux/pagemap.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <asm/string.h>
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 471bc3d1139e..f32f272ee501 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -16,7 +16,7 @@
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/in.h>
#include <linux/net.h>
#include <linux/mm.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 421b6f91e8ec..a6d26b46fc05 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -21,7 +21,7 @@
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e9aa235e9d10..f073a6d2c6a5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -110,20 +110,52 @@ out:
#if defined(CONFIG_NFS_V4_1)
/*
- * Lookup a layout by filehandle.
+ * Lookup a layout inode by stateid
*
- * Note: gets a refcount on the layout hdr and on its respective inode.
- * Caller must put the layout hdr and the inode.
+ * Note: returns a refcount on the inode and superblock
+ */
+static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_server *server;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (stateid != NULL &&
+ !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+ continue;
+ inode = igrab(lo->plh_inode);
+ if (!inode)
+ continue;
+ if (!nfs_sb_active(inode->i_sb)) {
+ rcu_read_lock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ spin_lock(&clp->cl_lock);
+ goto restart;
+ }
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Lookup a layout inode by filehandle.
+ *
+ * Note: returns a refcount on the inode and superblock
*
- * TODO: keep track of all layouts (and delegations) in a hash table
- * hashed by filehandle.
*/
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
- struct nfs_fh *fh)
+static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
+ const struct nfs_fh *fh)
{
struct nfs_server *server;
struct nfs_inode *nfsi;
- struct inode *ino;
+ struct inode *inode;
struct pnfs_layout_hdr *lo;
restart:
@@ -134,37 +166,38 @@ restart:
continue;
if (nfsi->layout != lo)
continue;
- ino = igrab(lo->plh_inode);
- if (!ino)
- break;
- spin_lock(&ino->i_lock);
- /* Is this layout in the process of being freed? */
- if (nfsi->layout != lo) {
- spin_unlock(&ino->i_lock);
- iput(ino);
+ inode = igrab(lo->plh_inode);
+ if (!inode)
+ continue;
+ if (!nfs_sb_active(inode->i_sb)) {
+ rcu_read_lock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ spin_lock(&clp->cl_lock);
goto restart;
}
- pnfs_get_layout_hdr(lo);
- spin_unlock(&ino->i_lock);
- return lo;
+ return inode;
}
}
return NULL;
}
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
- struct nfs_fh *fh)
+static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fh,
+ const nfs4_stateid *stateid)
{
- struct pnfs_layout_hdr *lo;
+ struct inode *inode;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh);
+ inode = nfs_layout_find_inode_by_stateid(clp, stateid);
+ if (!inode)
+ inode = nfs_layout_find_inode_by_fh(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- return lo;
+ return inode;
}
/*
@@ -213,18 +246,20 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh);
- if (!lo) {
- trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
- &args->cbl_stateid, -rv);
+ ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid);
+ if (!ino)
goto out;
- }
- ino = lo->plh_inode;
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ goto out;
+ }
+ pnfs_get_layout_hdr(lo);
rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
if (rv != NFS_OK)
goto unlock;
@@ -258,10 +293,10 @@ unlock:
/* Free all lsegs that are attached to commit buckets */
nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
+out:
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
&args->cbl_stateid, -rv);
- iput(ino);
-out:
+ nfs_iput_and_deactive(ino);
return rv;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ebecfb8fba06..91a8d610ba0f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -369,9 +369,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
* Look up a client by IP address and protocol version
* - creates a new record if one doesn't yet exist
*/
-struct nfs_client *
-nfs_get_client(const struct nfs_client_initdata *cl_init,
- rpc_authflavor_t authflavour)
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
{
struct nfs_client *clp, *new = NULL;
struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
@@ -655,7 +653,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
+ clp = nfs_get_client(&cl_init);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index dff600ae0d74..d7df5e67b0c1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -391,10 +391,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
- /* Ensure we revalidate the attributes and page cache! */
- spin_lock(&inode->i_lock);
- nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
- spin_unlock(&inode->i_lock);
trace_nfs4_set_delegation(inode, res->delegation_type);
out:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5f1af4cd1a33..fad81041f5ab 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -455,14 +455,17 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
}
/*
- * This function is called by the lookup code to request the use of
- * readdirplus to accelerate any future lookups in the same
+ * This function is called by the lookup and getattr code to request the
+ * use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-static
void nfs_advise_use_readdirplus(struct inode *dir)
{
- set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ !list_empty(&nfsi->open_files))
+ set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
}
/*
@@ -475,9 +478,12 @@ void nfs_advise_use_readdirplus(struct inode *dir)
*/
void nfs_force_use_readdirplus(struct inode *dir)
{
- if (!list_empty(&NFS_I(dir)->open_files)) {
- nfs_advise_use_readdirplus(dir);
- nfs_zap_mapping(dir, dir->i_mapping);
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ !list_empty(&nfsi->open_files)) {
+ set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ invalidate_mapping_pages(dir->i_mapping, 0, -1);
}
}
@@ -886,17 +892,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
goto out;
}
-static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
-{
- struct nfs_inode *nfsi = NFS_I(dir);
-
- if (nfs_attribute_cache_expired(dir))
- return true;
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- return true;
- return false;
-}
-
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -928,7 +923,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
- if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
+ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1035,8 +1030,6 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
int rcu_walk)
{
- int ret;
-
if (IS_ROOT(dentry))
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -1044,12 +1037,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
- if (rcu_walk)
- ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
- else
- ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
- if (ret < 0)
- return 0;
+ if (nfs_mapping_need_revalidate_inode(dir)) {
+ if (rcu_walk)
+ return 0;
+ if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ return 0;
+ }
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
return 1;
@@ -1161,7 +1154,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
goto out_bad;
}
- goto out_valid_noent;
+ goto out_valid;
}
if (is_bad_inode(inode)) {
@@ -1184,6 +1177,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
goto out_zap_parent;
}
+ nfs_advise_use_readdirplus(dir);
goto out_valid;
}
@@ -1219,12 +1213,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
+ /* set a readdirplus hint that we had a cache miss */
+ nfs_force_use_readdirplus(dir);
+
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
- /* Success: notify readdir to use READDIRPLUS */
- nfs_advise_use_readdirplus(dir);
- out_valid_noent:
if (flags & LOOKUP_RCU) {
if (parent != ACCESS_ONCE(dentry->d_parent))
return -ECHILD;
@@ -1279,8 +1273,8 @@ out_error:
*/
static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
{
- int error;
struct inode *inode = d_inode(dentry);
+ int error = 0;
/*
* I believe we can only get a negative dentry here in the case of a
@@ -1299,7 +1293,8 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
- error = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (nfs_mapping_need_revalidate_inode(inode))
+ error = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
__func__, inode->i_ino, error ? "invalid" : "valid");
return !error;
@@ -1424,8 +1419,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(res))
goto out_label;
- /* Success: notify readdir to use READDIRPLUS */
- nfs_advise_use_readdirplus(dir);
+ /* Notify readdir to use READDIRPLUS */
+ nfs_force_use_readdirplus(dir);
no_entry:
res = d_splice_alias(inode, dentry);
@@ -1467,9 +1462,9 @@ static fmode_t flags_to_mode(int flags)
return res;
}
-static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
- return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
+ return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
}
static int do_open(struct inode *inode, struct file *filp)
@@ -1535,8 +1530,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (open_flags & O_CREAT) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ mode &= ~current_umask();
+
attr.ia_valid |= ATTR_MODE;
- attr.ia_mode = mode & ~current_umask();
+ attr.ia_mode = mode;
}
if (open_flags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
@@ -1554,7 +1554,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
return finish_no_open(file, dentry);
}
- ctx = create_nfs_open_context(dentry, open_flags);
+ ctx = create_nfs_open_context(dentry, open_flags, file);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -2286,8 +2286,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
if (cache == NULL)
goto out;
/* Found an entry, is our attribute cache valid? */
- if (!nfs_attribute_cache_expired(inode) &&
- !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
+ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
break;
err = -ECHILD;
if (!may_block)
@@ -2335,12 +2334,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
cache = NULL;
if (cache == NULL)
goto out;
- err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
- if (err)
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
goto out;
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
+ err = 0;
out:
rcu_read_unlock();
return err;
@@ -2492,12 +2491,13 @@ EXPORT_SYMBOL_GPL(nfs_may_open);
static int nfs_execute_ok(struct inode *inode, int mask)
{
struct nfs_server *server = NFS_SERVER(inode);
- int ret;
+ int ret = 0;
- if (mask & MAY_NOT_BLOCK)
- ret = nfs_revalidate_inode_rcu(server, inode);
- else
- ret = nfs_revalidate_inode(server, inode);
+ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) {
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+ ret = __nfs_revalidate_inode(server, inode);
+ }
if (ret == 0 && !execute_ok(inode))
ret = -EACCES;
return ret;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bd81bcf3ffcf..aab32fc3d6a8 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -52,7 +52,7 @@
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/atomic.h>
#include "internal.h"
@@ -105,7 +105,7 @@ struct nfs_direct_req {
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);
static inline void get_dreq(struct nfs_direct_req *dreq)
@@ -684,7 +684,7 @@ out_failed:
}
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, dreq->inode);
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
@@ -717,7 +717,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
}
if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_direct_write_complete(dreq, data->inode);
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
@@ -768,7 +768,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
}
}
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
@@ -824,7 +824,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
out_put:
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, hdr->inode);
+ nfs_direct_write_complete(dreq);
hdr->release(hdr);
}
@@ -953,7 +953,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, dreq->inode);
+ nfs_direct_write_complete(dreq);
return 0;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 9ea85ae23c32..26dbe8b0c10d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -29,7 +29,7 @@
#include <linux/gfp.h>
#include <linux/swap.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "delegation.h"
#include "internal.h"
@@ -101,18 +101,11 @@ EXPORT_SYMBOL_GPL(nfs_file_release);
static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_inode *nfsi = NFS_I(inode);
-
- if (nfs_have_delegated_attributes(inode))
- goto out_noreval;
if (filp->f_flags & O_DIRECT)
goto force_reval;
- if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
- goto force_reval;
- if (nfs_attribute_timeout(inode))
+ if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE))
goto force_reval;
-out_noreval:
return 0;
force_reval:
return __nfs_revalidate_inode(server, inode);
@@ -374,7 +367,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
*/
if (!PageUptodate(page)) {
unsigned pglen = nfs_page_length(page);
- unsigned end = offset + len;
+ unsigned end = offset + copied;
if (pglen == 0) {
zero_user_segments(page, 0, offset,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4946ef40ba87..f956ca20a8a3 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -279,11 +279,11 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
dataserver_retrans, 4,
- s->nfs_client->cl_minorversion,
- s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+ s->nfs_client->cl_minorversion);
out_test_devid:
- if (filelayout_test_devid_unavailable(devid))
+ if (ret->ds_clp == NULL ||
+ filelayout_test_devid_unavailable(devid))
ret = NULL;
out:
return ret;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 98ace127bf86..0ca4af8cca5d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -25,9 +25,20 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+#define FF_LAYOUTRETURN_MAXERR 20
+
static struct group_info *ff_zero_group;
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr);
+static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit);
+static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror);
+
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
@@ -172,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
spin_lock(&inode->i_lock);
list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
- if (mirror->mirror_ds != pos->mirror_ds)
+ if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
@@ -349,19 +360,6 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
-static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
-{
- struct nfs4_deviceid_node *node;
- int i;
-
- if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
- return;
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- node = &fls->mirror_array[i]->mirror_ds->id_node;
- clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
- }
-}
-
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -415,8 +413,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
- struct nfs4_deviceid devid;
- struct nfs4_deviceid_node *idnode;
struct auth_cred acred = { .group_info = ff_zero_group };
struct rpc_cred __rcu *cred;
u32 ds_count, fh_count, id;
@@ -441,24 +437,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->ds_count = ds_count;
/* deviceid */
- rc = decode_deviceid(&stream, &devid);
+ rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
if (rc)
goto out_err_free;
- idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
- &devid, lh->plh_lc_cred,
- gfp_flags);
- /*
- * upon success, mirror_ds is allocated by previous
- * getdeviceinfo, or newly by .alloc_deviceid_node
- * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
- */
- if (idnode)
- fls->mirror_array[i]->mirror_ds =
- FF_LAYOUT_MIRROR_DS(idnode);
- else
- goto out_err_free;
-
/* efficiency */
rc = -EIO;
p = xdr_inline_decode(&stream, 4);
@@ -556,8 +538,6 @@ out_sort_mirrors:
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
- ff_layout_mark_devices_valid(fls);
-
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
@@ -639,12 +619,11 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
struct nfs4_ff_layoutstat *layoutstat,
ktime_t now)
{
- static const ktime_t notime = {0};
s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
- if (ktime_equal(mirror->start_time, notime))
+ if (!mirror->start_time)
mirror->start_time = now;
if (mirror->report_interval != 0)
report_interval = (s64)mirror->report_interval * 1000LL;
@@ -702,6 +681,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
spin_lock(&mirror->lock);
report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
if (report)
@@ -718,6 +698,7 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
requested, completed,
ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
@@ -731,6 +712,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
spin_lock(&mirror->lock);
report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
if (report)
@@ -750,6 +732,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
requested, completed, ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
@@ -1142,7 +1125,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
case -EPIPE:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
- nfs4_mark_deviceid_unavailable(devid);
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -1191,7 +1175,8 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
- nfs4_mark_deviceid_unavailable(devid);
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client,
+ &devid->deviceid);
}
/* FIXME: Need to prevent infinite looping here. */
return -NFS4ERR_RESET_TO_PNFS;
@@ -1293,6 +1278,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
+ ff_layout_read_record_layoutstats_done(task, hdr);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1961,38 +1947,88 @@ ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
id_node));
}
-static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
+static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ const struct nfs4_flexfile_layoutreturn_args *ff_args)
{
- struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
__be32 *start;
- int count = 0, ret = 0;
start = xdr_reserve_space(xdr, 4);
if (unlikely(!start))
return -E2BIG;
+ *start = cpu_to_be32(ff_args->num_errors);
/* This assume we always return _ALL_ layouts */
- spin_lock(&hdr->plh_inode->i_lock);
- ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
- spin_unlock(&hdr->plh_inode->i_lock);
+ return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
+}
- *start = cpu_to_be32(count);
+static void
+encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ __be32 *p;
- return ret;
+ p = xdr_reserve_space(xdr, len);
+ xdr_encode_opaque_fixed(p, buf, len);
+}
+
+static void
+ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+ p = xdr_reserve_space(xdr, 4*8);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
+}
+
+static void
+ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo,
+ devinfo->ld_private.data);
}
/* report nothing for now */
-static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
+static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct nfs4_flexfile_layoutreturn_args *ff_args)
{
__be32 *p;
+ int i;
p = xdr_reserve_space(xdr, 4);
- if (likely(p))
- *p = cpu_to_be32(0);
+ *p = cpu_to_be32(ff_args->num_dev);
+ for (i = 0; i < ff_args->num_dev; i++)
+ ff_layout_encode_ff_iostat(xdr,
+ &args->layout->plh_stateid,
+ &ff_args->devinfo[i]);
+}
+
+static void
+ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
+ unsigned int num_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_entries; i++) {
+ if (!devinfo[i].ld_private.ops)
+ continue;
+ if (!devinfo[i].ld_private.ops->free)
+ continue;
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
}
static struct nfs4_deviceid_node *
@@ -2008,24 +2044,91 @@ ff_layout_alloc_deviceid_node(struct nfs_server *server,
}
static void
-ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
-{
- struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
+ const void *voidargs,
+ const struct nfs4_xdr_opaque_data *ff_opaque)
+{
+ const struct nfs4_layoutreturn_args *args = voidargs;
+ struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
+ struct xdr_buf tmp_buf = {
+ .head = {
+ [0] = {
+ .iov_base = page_address(ff_args->pages[0]),
+ },
+ },
+ .buflen = PAGE_SIZE,
+ };
+ struct xdr_stream tmp_xdr;
__be32 *start;
dprintk("%s: Begin\n", __func__);
- start = xdr_reserve_space(xdr, 4);
- BUG_ON(!start);
- ff_layout_encode_ioerr(flo, xdr, args);
- ff_layout_encode_iostats(flo, xdr, args);
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+
+ ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
+ ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
+
+ start = xdr_reserve_space(xdr, 4);
+ *start = cpu_to_be32(tmp_buf.len);
+ xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
+static void
+ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+
+ if (!args->data)
+ return;
+ ff_args = args->data;
+ args->data = NULL;
+
+ ff_layout_free_ds_ioerr(&ff_args->errors);
+ ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
+
+ put_page(ff_args->pages[0]);
+ kfree(ff_args);
+}
+
+const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
+ .encode = ff_layout_encode_layoutreturn,
+ .free = ff_layout_free_layoutreturn,
+};
+
+static int
+ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
+
+ ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+ if (!ff_args)
+ goto out_nomem;
+ ff_args->pages[0] = alloc_page(GFP_KERNEL);
+ if (!ff_args->pages[0])
+ goto out_nomem_free;
+
+ INIT_LIST_HEAD(&ff_args->errors);
+ ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
+ &args->range, &ff_args->errors,
+ FF_LAYOUTRETURN_MAXERR);
+
+ spin_lock(&args->inode->i_lock);
+ ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
+ &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
+ spin_unlock(&args->inode->i_lock);
+
+ args->ld_private->ops = &layoutreturn_ops;
+ args->ld_private->data = ff_args;
+ return 0;
+out_nomem_free:
+ kfree(ff_args);
+out_nomem:
+ return -ENOMEM;
+}
+
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
@@ -2146,21 +2249,18 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr,
}
static void
-ff_layout_encode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_args *args,
- struct nfs42_layoutstat_devinfo *devinfo)
+ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror)
{
- struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
struct nfs4_pnfs_ds_addr *da;
struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
struct nfs_fh *fh = &mirror->fh_versions[0];
- __be32 *p, *start;
+ __be32 *p;
da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
dprintk("%s: DS %s: encoding address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- /* layoutupdate length */
- start = xdr_reserve_space(xdr, 4);
/* netaddr4 */
ff_layout_encode_netaddr(xdr, da);
/* nfs_fh4 */
@@ -2177,42 +2277,71 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
/* bool */
p = xdr_reserve_space(xdr, 4);
*p = cpu_to_be32(false);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
+ const struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
+ struct nfs42_layoutstat_devinfo, ld_private);
+ __be32 *start;
+
+ /* layoutupdate length */
+ start = xdr_reserve_space(xdr, 4);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
*start = cpu_to_be32((xdr->p - start - 1) * 4);
}
+static void
+ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs4_ff_layout_mirror *mirror = opaque->data;
+
+ ff_layout_put_mirror(mirror);
+}
+
+static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
+ .encode = ff_layout_encode_layoutstats,
+ .free = ff_layout_free_layoutstats,
+};
+
static int
-ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
- struct pnfs_layout_hdr *lo,
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
int dev_limit)
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *dev;
- struct nfs42_layoutstat_devinfo *devinfo;
int i = 0;
list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
if (i >= dev_limit)
break;
- if (!mirror->mirror_ds)
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
continue;
/* mirror refcount put in cleanup_layoutstats */
if (!atomic_inc_not_zero(&mirror->ref))
continue;
dev = &mirror->mirror_ds->id_node;
- devinfo = &args->devinfo[i];
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
devinfo->offset = 0;
devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
devinfo->layout_type = LAYOUT_FLEX_FILES;
- devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
- devinfo->layout_private = mirror;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = mirror;
+ devinfo++;
i++;
}
return i;
@@ -2222,47 +2351,27 @@ static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
struct nfs4_flexfile_layout *ff_layout;
- struct nfs4_ff_layout_mirror *mirror;
- int dev_count = 0;
+ const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
- spin_lock(&args->inode->i_lock);
- ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
- list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
- if (atomic_read(&mirror->ref) != 0)
- dev_count ++;
- }
- spin_unlock(&args->inode->i_lock);
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
- if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
- dprintk("%s: truncating devinfo to limit (%d:%d)\n",
- __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
- dev_count = PNFS_LAYOUTSTATS_MAXDEV;
- }
args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
if (!args->devinfo)
return -ENOMEM;
spin_lock(&args->inode->i_lock);
- args->num_dev = ff_layout_mirror_prepare_stats(args,
- &ff_layout->generic_hdr, dev_count);
+ ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+ args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
+ &args->devinfo[0], dev_count);
spin_unlock(&args->inode->i_lock);
+ if (!args->num_dev) {
+ kfree(args->devinfo);
+ args->devinfo = NULL;
+ return -ENOENT;
+ }
return 0;
}
-static void
-ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
-{
- struct nfs4_ff_layout_mirror *mirror;
- int i;
-
- for (i = 0; i < data->args.num_dev; i++) {
- mirror = data->args.devinfo[i].layout_private;
- data->args.devinfo[i].layout_private = NULL;
- ff_layout_put_mirror(mirror);
- }
-}
-
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -2284,10 +2393,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.read_pagelist = ff_layout_read_pagelist,
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
- .encode_layoutreturn = ff_layout_encode_layoutreturn,
+ .prepare_layoutreturn = ff_layout_prepare_layoutreturn,
.sync = pnfs_nfs_generic_sync,
.prepare_layoutstats = ff_layout_prepare_layoutstats,
- .cleanup_layoutstats = ff_layout_cleanup_layoutstats,
};
static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 3ee0c9fcea76..f4f39b0ab09b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -21,6 +21,7 @@
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+#define FF_LAYOUTSTATS_MAXDEV 4
struct nfs4_ff_ds_version {
u32 version;
@@ -73,6 +74,7 @@ struct nfs4_ff_layout_mirror {
struct list_head mirrors;
u32 ds_count;
u32 efficiency;
+ struct nfs4_deviceid devid;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
@@ -81,12 +83,15 @@ struct nfs4_ff_layout_mirror {
struct rpc_cred __rcu *rw_cred;
atomic_t ref;
spinlock_t lock;
+ unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
u32 report_interval;
};
+#define NFS4_FF_MIRROR_STAT_AVAIL (0)
+
struct nfs4_ff_layout_segment {
struct pnfs_layout_segment generic_hdr;
u64 stripe_unit;
@@ -103,6 +108,14 @@ struct nfs4_flexfile_layout {
ktime_t last_report_time; /* Layoutstat report times */
};
+struct nfs4_flexfile_layoutreturn_args {
+ struct list_head errors;
+ struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV];
+ unsigned int num_errors;
+ unsigned int num_dev;
+ struct page *pages[1];
+};
+
static inline struct nfs4_flexfile_layout *
FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
{
@@ -180,9 +193,12 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_mirror *mirror, u64 offset,
u64 length, int status, enum nfs_opnum4 opnum,
gfp_t gfp_flags);
-int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr, int *count,
- const struct pnfs_layout_range *range);
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
+void ff_layout_free_ds_ioerr(struct list_head *head);
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum);
struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
@@ -197,7 +213,6 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
struct inode *inode);
struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
u32 ds_idx, struct rpc_cred *mdscred);
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f7a3f6b05369..e5a6f248697b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -20,9 +20,11 @@
static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
static unsigned int dataserver_retrans;
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
{
- if (mirror_ds)
+ if (!IS_ERR_OR_NULL(mirror_ds))
nfs4_put_deviceid_node(&mirror_ds->id_node);
}
@@ -175,19 +177,36 @@ out_err:
static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
struct nfs4_deviceid_node *devid)
{
- nfs4_mark_deviceid_unavailable(devid);
+ nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid);
if (!ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
lseg);
}
static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_mirror *mirror,
+ bool create)
{
- if (mirror == NULL || mirror->mirror_ds == NULL) {
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
- lseg);
- return false;
+ if (mirror == NULL || IS_ERR(mirror->mirror_ds))
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ if (create) {
+ struct nfs4_deviceid_node *node;
+ struct pnfs_layout_hdr *lh = lseg->pls_layout;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
+ &mirror->devid, lh->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
+ } else
+ goto outerr;
}
if (mirror->mirror_ds->ds == NULL) {
struct nfs4_deviceid_node *devid;
@@ -196,15 +215,9 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
return false;
}
return true;
-}
-
-static u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
+outerr:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
+ return false;
}
static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
@@ -212,8 +225,8 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
{
u64 end;
- end = max_t(u64, end_offset(err->offset, err->length),
- end_offset(offset, length));
+ end = max_t(u64, pnfs_end_offset(err->offset, err->length),
+ pnfs_end_offset(offset, length));
err->offset = min_t(u64, err->offset, offset);
err->length = end - err->offset;
}
@@ -235,9 +248,9 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
if (ret != 0)
return ret;
- if (end_offset(e1->offset, e1->length) < e2->offset)
+ if (pnfs_end_offset(e1->offset, e1->length) < e2->offset)
return -1;
- if (e1->offset > end_offset(e2->offset, e2->length))
+ if (e1->offset > pnfs_end_offset(e2->offset, e2->length))
return 1;
/* If ranges overlap or are contiguous, they are the same */
return 0;
@@ -263,8 +276,9 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
}
/* Entries match, so merge "err" into "dserr" */
extend_ds_error(dserr, err->offset, err->length);
- list_del(&err->list);
+ list_replace(&err->list, &dserr->list);
kfree(err);
+ return;
}
list_add_tail(&dserr->list, head);
@@ -331,7 +345,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
struct nfs_fh *fh = NULL;
- if (!ff_layout_mirror_valid(lseg, mirror)) {
+ if (!ff_layout_mirror_valid(lseg, mirror, false)) {
pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
__func__, mirror_idx);
goto out;
@@ -371,7 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- if (!ff_layout_mirror_valid(lseg, mirror)) {
+ if (!ff_layout_mirror_valid(lseg, mirror, true)) {
pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
goto out;
@@ -393,8 +407,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
- mirror->mirror_ds->ds_versions[0].minor_version,
- RPC_AUTH_UNIX);
+ mirror->mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
if (ds->ds_clp) {
@@ -457,28 +470,26 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
}
-static bool is_range_intersecting(u64 offset1, u64 length1,
- u64 offset2, u64 length2)
+void ff_layout_free_ds_ioerr(struct list_head *head)
{
- u64 end1 = end_offset(offset1, length1);
- u64 end2 = end_offset(offset2, length2);
+ struct nfs4_ff_layout_ds_err *err;
- return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
- (end2 == NFS4_MAX_UINT64 || end2 > offset1);
+ while (!list_empty(head)) {
+ err = list_first_entry(head,
+ struct nfs4_ff_layout_ds_err,
+ list);
+ list_del(&err->list);
+ kfree(err);
+ }
}
/* called with inode i_lock held */
-int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr, int *count,
- const struct pnfs_layout_range *range)
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head)
{
- struct nfs4_ff_layout_ds_err *err, *n;
+ struct nfs4_ff_layout_ds_err *err;
__be32 *p;
- list_for_each_entry_safe(err, n, &flo->error_list, list) {
- if (!is_range_intersecting(err->offset, err->length,
- range->offset, range->length))
- continue;
+ list_for_each_entry(err, head, list) {
/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
* + array length + deviceid(NFS4_DEVICEID4_SIZE)
* + status(4) + opnum(4)
@@ -497,17 +508,59 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(err->status);
*p++ = cpu_to_be32(err->opnum);
- *count += 1;
- list_del(&err->list);
- dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
+ dprintk("%s: offset %llu length %llu status %d op %d\n",
__func__, err->offset, err->length, err->status,
- err->opnum, *count);
- kfree(err);
+ err->opnum);
}
return 0;
}
+static
+unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ struct inode *inode = lo->plh_inode;
+ struct nfs4_ff_layout_ds_err *err, *n;
+ unsigned int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry_safe(err, n, &flo->error_list, list) {
+ if (!pnfs_is_range_intersecting(err->offset,
+ pnfs_end_offset(err->offset, err->length),
+ range->offset,
+ pnfs_end_offset(range->offset, range->length)))
+ continue;
+ if (!maxnum)
+ break;
+ list_move(&err->list, head);
+ maxnum--;
+ ret++;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ unsigned int ret;
+
+ ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum);
+ /* If we're over the max, discard all remaining entries */
+ if (ret == maxnum) {
+ LIST_HEAD(discard);
+ do_layout_fetch_ds_ioerr(lo, range, &discard, -1);
+ ff_layout_free_ds_ioerr(&discard);
+ }
+ return ret;
+}
+
static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -516,7 +569,11 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (mirror && mirror->mirror_ds) {
+ if (mirror) {
+ if (!mirror->mirror_ds)
+ return true;
+ if (IS_ERR(mirror->mirror_ds))
+ continue;
devid = &mirror->mirror_ds->id_node;
if (!ff_layout_test_devid_unavailable(devid))
return true;
@@ -534,8 +591,10 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (!mirror || !mirror->mirror_ds)
+ if (!mirror || IS_ERR(mirror->mirror_ds))
return false;
+ if (!mirror->mirror_ds)
+ continue;
devid = &mirror->mirror_ds->id_node;
if (ff_layout_test_devid_unavailable(devid))
return false;
@@ -544,7 +603,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
}
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
if (lseg->pls_range.iomode == IOMODE_READ)
return ff_read_layout_has_available_ds(lseg);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a608ffd28acc..391dafaf9182 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -30,7 +30,7 @@
#include <linux/namei.h>
#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bf4ec5ecc97e..5ca4d96b1942 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -39,7 +39,7 @@
#include <linux/compat.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -160,6 +160,43 @@ int nfs_sync_mapping(struct address_space *mapping)
return ret;
}
+static int nfs_attribute_timeout(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+}
+
+static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags)
+{
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+ /* Special case for the pagecache or access cache */
+ if (flags == NFS_INO_REVAL_PAGECACHE &&
+ !(cache_validity & NFS_INO_REVAL_FORCED))
+ return false;
+ return (cache_validity & flags) != 0;
+}
+
+static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags)
+{
+ unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+
+ if ((cache_validity & flags) != 0)
+ return true;
+ if (nfs_attribute_timeout(inode))
+ return true;
+ return false;
+}
+
+bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
+{
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return nfs_check_cache_invalid_delegated(inode, flags);
+
+ return nfs_check_cache_invalid_not_delegated(inode, flags);
+}
+
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -634,15 +671,28 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
-static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
{
struct dentry *parent;
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
+ return;
parent = dget_parent(dentry);
nfs_force_use_readdirplus(d_inode(parent));
dput(parent);
}
+static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
+ return;
+ parent = dget_parent(dentry);
+ nfs_advise_use_readdirplus(d_inode(parent));
+ dput(parent);
+}
+
static bool nfs_need_revalidate_inode(struct inode *inode)
{
if (NFS_I(inode)->cache_validity &
@@ -683,10 +733,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
if (need_atime || nfs_need_revalidate_inode(inode)) {
struct nfs_server *server = NFS_SERVER(inode);
- if (server->caps & NFS_CAP_READDIRPLUS)
- nfs_request_parent_use_readdirplus(dentry);
+ nfs_readdirplus_parent_cache_miss(dentry);
err = __nfs_revalidate_inode(server, inode);
- }
+ } else
+ nfs_readdirplus_parent_cache_hit(dentry);
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -702,8 +752,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
{
atomic_set(&l_ctx->count, 1);
- l_ctx->lockowner.l_owner = current->files;
- l_ctx->lockowner.l_pid = current->tgid;
+ l_ctx->lockowner = current->files;
INIT_LIST_HEAD(&l_ctx->list);
atomic_set(&l_ctx->io_count, 0);
}
@@ -714,9 +763,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
struct nfs_lock_context *pos = head;
do {
- if (pos->lockowner.l_owner != current->files)
- continue;
- if (pos->lockowner.l_pid != current->tgid)
+ if (pos->lockowner != current->files)
continue;
atomic_inc(&pos->count);
return pos;
@@ -785,6 +832,8 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
if (!is_sync)
return;
inode = d_inode(ctx->dentry);
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ return;
nfsi = NFS_I(inode);
if (inode->i_mapping->nrpages == 0)
return;
@@ -799,7 +848,9 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
}
EXPORT_SYMBOL_GPL(nfs_close_context);
-struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
+ fmode_t f_mode,
+ struct file *filp)
{
struct nfs_open_context *ctx;
struct rpc_cred *cred = rpc_lookup_cred();
@@ -818,6 +869,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f
ctx->mode = f_mode;
ctx->flags = 0;
ctx->error = 0;
+ ctx->flock_owner = (fl_owner_t)filp;
nfs_init_lock_context(&ctx->lock_context);
ctx->lock_context.open_context = ctx;
INIT_LIST_HEAD(&ctx->list);
@@ -942,7 +994,7 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -1031,13 +1083,6 @@ out:
return status;
}
-int nfs_attribute_timeout(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
-}
-
int nfs_attribute_cache_expired(struct inode *inode)
{
if (nfs_have_delegated_attributes(inode))
@@ -1060,15 +1105,6 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
}
EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
-int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
-{
- if (!(NFS_I(inode)->cache_validity &
- (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
- && !nfs_attribute_cache_expired(inode))
- return NFS_STALE(inode) ? -ESTALE : 0;
- return -ECHILD;
-}
-
static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1099,13 +1135,10 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
return 0;
}
-static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
{
- if (nfs_have_delegated_attributes(inode))
- return false;
- return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
- || nfs_attribute_timeout(inode)
- || NFS_STALE(inode);
+ return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+ NFS_STALE(inode);
}
int nfs_revalidate_mapping_rcu(struct inode *inode)
@@ -1317,7 +1350,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
invalid |= NFS_INO_INVALID_ATIME;
if (invalid != 0)
- nfs_set_cache_invalid(inode, invalid);
+ nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED);
nfsi->read_cache_jiffies = fattr->time_start;
return 0;
@@ -1517,13 +1550,6 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
{
unsigned long invalid = NFS_INO_INVALID_ATTR;
- /*
- * Don't revalidate the pagecache if we hold a delegation, but do
- * force an attribute update
- */
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
- invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
-
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
nfs_set_cache_invalid(inode, invalid);
@@ -2015,7 +2041,7 @@ static void nfsiod_stop(void)
destroy_workqueue(wq);
}
-int nfs_net_id;
+unsigned int nfs_net_id;
EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 80bcc0befb07..09ca5095c04e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -154,8 +154,7 @@ extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
-struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
- rpc_authflavor_t);
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
@@ -194,14 +193,13 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
unsigned int ds_retrans,
- u32 minor_version,
- rpc_authflavor_t au_flavor);
+ u32 minor_version);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
- unsigned int ds_retrans, rpc_authflavor_t au_flavor);
+ unsigned int ds_retrans);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -346,6 +344,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
/* dir.c */
+extern void nfs_advise_use_readdirplus(struct inode *dir);
extern void nfs_force_use_readdirplus(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
@@ -382,6 +381,7 @@ extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
+extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
extern int nfs_wait_atomic_killable(atomic_t *p);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index fbce0d885d4c..5fbd2bde91ba 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -35,6 +35,6 @@ struct nfs_net {
#endif
};
-extern int nfs_net_id;
+extern unsigned int nfs_net_id;
#endif
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index ee753547fb0a..7879f2a0fcfd 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -78,8 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
*/
struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
- int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
- rpc_authflavor_t au_flavor)
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
{
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
@@ -106,7 +105,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
/* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, au_flavor);
+ clp = nfs_get_client(&cl_init);
return clp;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 608501971fe0..d12ff9385f49 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -397,10 +397,13 @@ static void
nfs42_layoutstat_release(void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
- struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+ struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo;
+ int i;
- if (nfss->pnfs_curr_ld->cleanup_layoutstats)
- nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+ for (i = 0; i < data->args.num_dev; i++) {
+ if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free)
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
smp_mb__before_atomic();
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 8b2605882a20..6c7296454bbc 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -181,8 +181,9 @@ static void encode_layoutstats(struct xdr_stream *xdr,
NFS4_DEVICEID4_SIZE);
/* Encode layoutupdate4 */
*p++ = cpu_to_be32(devinfo->layout_type);
- if (devinfo->layoutstats_encode != NULL)
- devinfo->layoutstats_encode(xdr, args, devinfo);
+ if (devinfo->ld_private.ops)
+ devinfo->ld_private.ops->encode(xdr, args,
+ &devinfo->ld_private);
else
encode_uint32(xdr, 0);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1452177c822d..665165833660 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -457,7 +457,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
- const struct nfs_lockowner *, nfs4_stateid *,
+ const struct nfs_lock_context *, nfs4_stateid *,
struct rpc_cred **);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 074ac7131459..5ae9d64ea08b 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -464,6 +464,11 @@ static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
}
+static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
+}
+
/**
* nfs40_walk_client_list - Find server that recognizes a client ID
*
@@ -521,7 +526,21 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (!nfs4_match_client_owner_id(pos, new))
continue;
-
+ /*
+ * We just sent a new SETCLIENTID, which should have
+ * caused the server to return a new cl_confirm. So if
+ * cl_confirm is the same, then this is a different
+ * server that just returned the same cl_confirm by
+ * coincidence:
+ */
+ if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm,
+ &new->cl_confirm))
+ continue;
+ /*
+ * But if the cl_confirm's are different, then the only
+ * way that a SETCLIENTID_CONFIRM to pos can succeed is
+ * if new and pos point to the same server:
+ */
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
@@ -534,6 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
break;
case 0:
nfs4_swap_callback_idents(pos, new);
+ pos->cl_confirm = new->cl_confirm;
prev = NULL;
*result = pos;
@@ -881,7 +901,6 @@ static int nfs4_set_client(struct nfs_server *server,
const struct sockaddr *addr,
const size_t addrlen,
const char *ip_addr,
- rpc_authflavor_t authflavour,
int proto, const struct rpc_timeout *timeparms,
u32 minorversion, struct net *net)
{
@@ -907,7 +926,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, authflavour);
+ clp = nfs_get_client(&cl_init);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
@@ -948,7 +967,7 @@ error:
struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
- u32 minor_version, rpc_authflavor_t au_flavor)
+ u32 minor_version)
{
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
@@ -979,7 +998,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, au_flavor);
+ clp = nfs_get_client(&cl_init);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
@@ -1103,7 +1122,6 @@ static int nfs4_init_server(struct nfs_server *server,
(const struct sockaddr *)&data->nfs_server.address,
data->nfs_server.addrlen,
data->client_address,
- data->selected_flavor,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
@@ -1200,7 +1218,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
data->addr,
data->addrlen,
parent_client->cl_ipaddr,
- data->authflavor,
rpc_protocol(parent_server->client),
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
@@ -1311,7 +1328,6 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
- clp->cl_rpcclient->cl_auth->au_flavor,
clp->cl_proto, clnt->cl_timeout,
clp->cl_minorversion, net);
nfs_put_client(clp);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 89a77950e0b0..0efba77789b9 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 241da19b7da4..0a0eaecf9676 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -38,7 +38,6 @@
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/errno.h>
-#include <linux/file.h>
#include <linux/string.h>
#include <linux/ratelimit.h>
#include <linux/printk.h>
@@ -94,7 +93,7 @@ static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fa
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
@@ -226,7 +225,6 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
static const u32 nfs4_open_noattr_bitmap[3] = {
FATTR4_WORD0_TYPE
- | FATTR4_WORD0_CHANGE
| FATTR4_WORD0_FILEID,
};
@@ -817,6 +815,10 @@ static int nfs41_sequence_process(struct rpc_task *task,
case -NFS4ERR_SEQ_FALSE_RETRY:
++slot->seq_nr;
goto retry_nowait;
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_BADSESSION:
+ nfs4_schedule_session_recovery(session, res->sr_status);
+ goto retry_nowait;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -1080,15 +1082,24 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
return nfs4_call_sync_sequence(clnt, server, msg, args, res);
}
-static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+ unsigned long timestamp)
{
struct nfs_inode *nfsi = NFS_I(dir);
spin_lock(&dir->i_lock);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
- if (!cinfo->atomic || cinfo->before != dir->i_version)
+ if (cinfo->atomic && cinfo->before == dir->i_version) {
+ nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+ nfsi->attrtimeo_timestamp = jiffies;
+ } else {
nfs_force_lookup_revalidate(dir);
+ if (cinfo->before != dir->i_version)
+ nfsi->cache_validity |= NFS_INO_INVALID_ACCESS |
+ NFS_INO_INVALID_ACL;
+ }
dir->i_version = cinfo->after;
+ nfsi->read_cache_jiffies = timestamp;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfs_fscache_invalidate(dir);
spin_unlock(&dir->i_lock);
@@ -1221,6 +1232,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
atomic_inc(&sp->so_count);
p->o_arg.open_flags = flags;
p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+ p->o_arg.umask = current_umask();
+ p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
p->o_arg.share_access = nfs4_map_atomic_open_share(server,
fmode, flags);
/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
@@ -1228,8 +1241,16 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (!(flags & O_EXCL)) {
/* ask server to check for all possible rights as results
* are cached */
- p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
- NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ |
+ NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND |
+ NFS4_ACCESS_EXECUTE;
+ }
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
@@ -1239,7 +1260,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
p->o_arg.label = nfs4_label_copy(p->a_label, label);
- p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
switch (p->o_arg.claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
@@ -2372,11 +2392,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
nfs_fattr_map_and_free_names(server, &data->f_attr);
if (o_arg->open_flags & O_CREAT) {
- update_changeattr(dir, &o_res->cinfo);
if (o_arg->open_flags & O_EXCL)
data->file_created = 1;
else if (o_res->cinfo.before != o_res->cinfo.after)
data->file_created = 1;
+ if (data->file_created || dir->i_version != o_res->cinfo.after)
+ update_changeattr(dir, &o_res->cinfo,
+ o_res->f_attr->time_start);
}
if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
server->caps &= ~NFS_CAP_POSIX_LOCK;
@@ -2678,7 +2700,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
sattr->ia_valid |= ATTR_MTIME;
/* Except MODE, it seems harmless of setting twice. */
- if ((attrset[1] & FATTR4_WORD1_MODE))
+ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
+ attrset[1] & FATTR4_WORD1_MODE)
sattr->ia_valid &= ~ATTR_MODE;
if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
@@ -2819,7 +2842,7 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- state, label, olabel);
+ ctx, label, olabel);
if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr,
opendata->o_res.f_attr);
@@ -2914,7 +2937,7 @@ static int _nfs4_do_setattr(struct inode *inode,
struct nfs_setattrargs *arg,
struct nfs_setattrres *res,
struct rpc_cred *cred,
- struct nfs4_state *state)
+ struct nfs_open_context *ctx)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_message msg = {
@@ -2937,15 +2960,17 @@ static int _nfs4_do_setattr(struct inode *inode,
if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
- } else if (truncate && state != NULL) {
- struct nfs_lockowner lockowner = {
- .l_owner = current->files,
- .l_pid = current->tgid,
- };
- if (!nfs4_valid_open_stateid(state))
+ } else if (truncate && ctx != NULL) {
+ struct nfs_lock_context *l_ctx;
+ if (!nfs4_valid_open_stateid(ctx->state))
return -EBADF;
- if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
- &arg->stateid, &delegation_cred) == -EIO)
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ return PTR_ERR(l_ctx);
+ status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx,
+ &arg->stateid, &delegation_cred);
+ nfs_put_lock_context(l_ctx);
+ if (status == -EIO)
return -EBADF;
} else
nfs4_stateid_copy(&arg->stateid, &zero_stateid);
@@ -2955,7 +2980,7 @@ static int _nfs4_do_setattr(struct inode *inode,
status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
put_rpccred(delegation_cred);
- if (status == 0 && state != NULL)
+ if (status == 0 && ctx != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
@@ -2963,10 +2988,11 @@ static int _nfs4_do_setattr(struct inode *inode,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_state *state = ctx ? ctx->state : NULL;
struct nfs_setattrargs arg = {
.fh = NFS_FH(inode),
.iap = sattr,
@@ -2991,7 +3017,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
arg.bitmask = nfs4_bitmask(server, olabel);
do {
- err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3028,10 +3054,15 @@ struct nfs4_closedata {
struct nfs4_state *state;
struct nfs_closeargs arg;
struct nfs_closeres res;
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
struct nfs_fattr fattr;
unsigned long timestamp;
- bool roc;
- u32 roc_barrier;
};
static void nfs4_free_closedata(void *data)
@@ -3040,8 +3071,9 @@ static void nfs4_free_closedata(void *data)
struct nfs4_state_owner *sp = calldata->state->owner;
struct super_block *sb = calldata->state->inode->i_sb;
- if (calldata->roc)
- pnfs_roc_release(calldata->state->inode);
+ if (calldata->lr.roc)
+ pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res,
+ calldata->res.lr_ret);
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
@@ -3060,17 +3092,50 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (calldata->arg.lr_args && task->tk_status != 0) {
+ switch (calldata->res.lr_ret) {
+ default:
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ break;
+ case 0:
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ case -NFS4ERR_WRONG_CRED:
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ calldata->res.lr_ret = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+
/* hmm. we are done with the inode, and in the process of freeing
* the state_owner. we keep this around to process errors
*/
switch (task->tk_status) {
case 0:
res_stateid = &calldata->res.stateid;
- if (calldata->roc)
- pnfs_roc_set_barrier(state->inode,
- calldata->roc_barrier);
renew_lease(server, calldata->timestamp);
break;
+ case -NFS4ERR_ACCESS:
+ if (calldata->arg.bitmask != NULL) {
+ calldata->arg.bitmask = NULL;
+ calldata->res.fattr = NULL;
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out_release;
+
+ }
+ break;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -3096,7 +3161,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
- nfs_refresh_inode(calldata->inode, calldata->res.fattr);
+ nfs_refresh_inode(calldata->inode, &calldata->fattr);
dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
}
@@ -3144,21 +3209,30 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
- if (nfs4_wait_on_layoutreturn(inode, task)) {
+ if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) {
nfs_release_seqid(calldata->arg.seqid);
goto out_wait;
}
if (calldata->arg.fmode == 0)
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
- if (calldata->roc)
- pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
+ if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
+ /* Close-to-open cache consistency revalidation */
+ if (!nfs4_have_delegation(inode, FMODE_READ))
+ calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+ else
+ calldata->arg.bitmask = NULL;
+ }
calldata->arg.share_access =
nfs4_map_atomic_open_share(NFS_SERVER(inode),
calldata->arg.fmode, 0);
- nfs_fattr_init(calldata->res.fattr);
+ if (calldata->res.fattr == NULL)
+ calldata->arg.bitmask = NULL;
+ else if (calldata->arg.bitmask == NULL)
+ calldata->res.fattr = NULL;
calldata->timestamp = jiffies;
if (nfs4_setup_sequence(NFS_SERVER(inode),
&calldata->arg.seq_args,
@@ -3179,13 +3253,6 @@ static const struct rpc_call_ops nfs4_close_ops = {
.rpc_release = nfs4_free_closedata,
};
-static bool nfs4_roc(struct inode *inode)
-{
- if (!nfs_have_layout(inode))
- return false;
- return pnfs_roc(inode);
-}
-
/*
* It is possible for data to be read/written from a mem-mapped file
* after the sys_close call (which hits the vfs layer as a flush).
@@ -3232,12 +3299,19 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
if (IS_ERR(calldata->arg.seqid))
goto out_free_calldata;
+ nfs_fattr_init(&calldata->fattr);
calldata->arg.fmode = 0;
- calldata->arg.bitmask = server->cache_consistency_bitmask;
+ calldata->lr.arg.ld_private = &calldata->lr.ld_private;
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
- calldata->roc = nfs4_roc(state->inode);
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ calldata->lr.roc = pnfs_roc(state->inode,
+ &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred);
+ if (calldata->lr.roc) {
+ calldata->arg.lr_args = &calldata->lr.arg;
+ calldata->res.lr_res = &calldata->lr.res;
+ }
nfs_sb_active(calldata->inode->i_sb);
msg.rpc_argp = &calldata->arg;
@@ -3290,7 +3364,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -3687,7 +3761,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
{
struct inode *inode = d_inode(dentry);
struct rpc_cred *cred = NULL;
- struct nfs4_state *state = NULL;
+ struct nfs_open_context *ctx = NULL;
struct nfs4_label *label = NULL;
int status;
@@ -3708,20 +3782,17 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
/* Search for an existing open(O_WRITE) file */
if (sattr->ia_valid & ATTR_FILE) {
- struct nfs_open_context *ctx;
ctx = nfs_file_open_context(sattr->ia_file);
- if (ctx) {
+ if (ctx)
cred = ctx->cred;
- state = ctx->state;
- }
}
label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
if (IS_ERR(label))
return PTR_ERR(label);
- status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
if (status == 0) {
nfs_setattr_update_inode(inode, sattr, fattr);
nfs_setsecurity(inode, fattr, label);
@@ -3966,18 +4037,20 @@ static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
- ctx = alloc_nfs_open_context(dentry, FMODE_READ);
+ ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
if (IS_ERR(state)) {
status = PTR_ERR(state);
@@ -4004,11 +4077,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name)
.rpc_argp = &args,
.rpc_resp = &res,
};
+ unsigned long timestamp = jiffies;
int status;
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
if (status == 0)
- update_changeattr(dir, &res.cinfo);
+ update_changeattr(dir, &res.cinfo, timestamp);
return status;
}
@@ -4056,7 +4130,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (nfs4_async_handle_error(task, res->server, NULL,
&data->timeout) == -EAGAIN)
return 0;
- update_changeattr(dir, &res->cinfo);
+ if (task->tk_status == 0)
+ update_changeattr(dir, &res->cinfo, res->dir_attr->time_start);
return 1;
}
@@ -4090,8 +4165,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
return 0;
- update_changeattr(old_dir, &res->old_cinfo);
- update_changeattr(new_dir, &res->new_cinfo);
+ if (task->tk_status == 0) {
+ update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start);
+ if (new_dir != old_dir)
+ update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start);
+ }
return 1;
}
@@ -4128,7 +4206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
- update_changeattr(dir, &res.cinfo);
+ update_changeattr(dir, &res.cinfo, res.fattr->time_start);
status = nfs_post_op_update_inode(inode, res.fattr);
if (!status)
nfs_setsecurity(inode, res.fattr, res.label);
@@ -4185,6 +4263,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.attrs = sattr;
data->arg.ftype = ftype;
data->arg.bitmask = nfs4_bitmask(server, data->label);
+ data->arg.umask = current_umask();
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
@@ -4202,7 +4281,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
- update_changeattr(dir, &data->res.dir_cinfo);
+ update_changeattr(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start);
status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
}
return status;
@@ -4282,13 +4362,15 @@ out:
static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_label l, *label = NULL;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
do {
err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
@@ -4391,13 +4473,15 @@ out:
static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_label l, *label = NULL;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
do {
err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
trace_nfs4_mknod(dir, &dentry->d_name, err);
@@ -4541,11 +4625,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_lock_context *l_ctx,
fmode_t fmode)
{
- const struct nfs_lockowner *lockowner = NULL;
-
- if (l_ctx != NULL)
- lockowner = &l_ctx->lockowner;
- return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
+ return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL);
}
EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
@@ -5564,11 +5644,16 @@ struct nfs4_delegreturndata {
struct nfs_fh fh;
nfs4_stateid stateid;
unsigned long timestamp;
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
struct nfs_fattr fattr;
int rpc_status;
struct inode *inode;
- bool roc;
- u32 roc_barrier;
};
static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -5579,6 +5664,32 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
return;
trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (data->args.lr_args && task->tk_status != 0) {
+ switch(data->res.lr_ret) {
+ default:
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ break;
+ case 0:
+ data->args.lr_args = NULL;
+ data->res.lr_res = NULL;
+ break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ case -NFS4ERR_WRONG_CRED:
+ data->args.lr_args = NULL;
+ data->res.lr_res = NULL;
+ data->res.lr_ret = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+
switch (task->tk_status) {
case 0:
renew_lease(data->res.server, data->timestamp);
@@ -5594,6 +5705,14 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_STALE_STATEID:
task->tk_status = 0;
break;
+ case -NFS4ERR_ACCESS:
+ if (data->args.bitmask) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ }
default:
if (nfs4_async_handle_error(task, data->res.server,
NULL, NULL) == -EAGAIN) {
@@ -5602,8 +5721,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
}
}
data->rpc_status = task->tk_status;
- if (data->roc && data->rpc_status == 0)
- pnfs_roc_set_barrier(data->inode, data->roc_barrier);
}
static void nfs4_delegreturn_release(void *calldata)
@@ -5612,8 +5729,10 @@ static void nfs4_delegreturn_release(void *calldata)
struct inode *inode = data->inode;
if (inode) {
- if (data->roc)
- pnfs_roc_release(inode);
+ if (data->lr.roc)
+ pnfs_roc_release(&data->lr.arg, &data->lr.res,
+ data->res.lr_ret);
+ nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -5625,12 +5744,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (nfs4_wait_on_layoutreturn(d_data->inode, task))
+ if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task))
return;
- if (d_data->roc)
- pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
-
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
&d_data->res.seq_res,
@@ -5676,12 +5792,22 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs4_stateid_copy(&data->stateid, stateid);
data->res.fattr = &data->fattr;
data->res.server = server;
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ data->lr.arg.ld_private = &data->lr.ld_private;
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
+ data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred);
data->inode = nfs_igrab_and_active(inode);
- if (data->inode)
- data->roc = nfs4_roc(inode);
+ if (data->inode) {
+ if (data->lr.roc) {
+ data->args.lr_args = &data->lr.arg;
+ data->res.lr_res = &data->lr.res;
+ }
+ } else if (data->lr.roc) {
+ pnfs_roc_release(&data->lr.arg, &data->lr.res, 0);
+ data->lr.roc = false;
+ }
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
@@ -5695,10 +5821,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (status != 0)
goto out;
status = data->rpc_status;
- if (status == 0)
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
- else
- nfs_refresh_inode(inode, &data->fattr);
out:
rpc_put_task(task);
return status;
@@ -6015,7 +6137,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->server = server;
atomic_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
- get_file(fl->fl_file);
memcpy(&p->fl, fl, sizeof(p->fl));
return p;
out_free_seqid:
@@ -6128,7 +6249,6 @@ static void nfs4_lock_release(void *calldata)
nfs_free_seqid(data->arg.lock_seqid);
nfs4_put_lock_state(data->lsp);
put_nfs_open_context(data->ctx);
- fput(data->fl.fl_file);
kfree(data);
dprintk("%s: done!\n", __func__);
}
@@ -8371,6 +8491,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
goto out;
}
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
err = nfs4_handle_exception(server, nfs4err, exception);
if (!status) {
if (exception->retry)
@@ -8559,21 +8680,13 @@ static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
- LIST_HEAD(freeme);
dprintk("--> %s\n", __func__);
- spin_lock(&lo->plh_inode->i_lock);
- if (lrp->res.lrs_present) {
- pnfs_mark_matching_lsegs_invalid(lo, &freeme,
- &lrp->args.range,
- be32_to_cpu(lrp->args.stateid.seqid));
- pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- } else
- pnfs_mark_layout_stateid_invalid(lo, &freeme);
- pnfs_clear_layoutreturn_waitbit(lo);
- spin_unlock(&lo->plh_inode->i_lock);
+ pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
+ lrp->res.lrs_present ? &lrp->res.stateid : NULL);
nfs4_sequence_free_slot(&lrp->res.seq_res);
- pnfs_free_lseg_list(&freeme);
+ if (lrp->ld_private.ops && lrp->ld_private.ops->free)
+ lrp->ld_private.ops->free(&lrp->ld_private);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
kfree(calldata);
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index a61350f75c74..769b85655c4b 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -169,7 +169,7 @@ bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
{
if (slotid <= tbl->max_slotid)
- return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT);
return ERR_PTR(-E2BIG);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0959c9661662..daeb94e3acd4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -494,21 +494,18 @@ nfs4_alloc_state_owner(struct nfs_server *server,
}
static void
-nfs4_drop_state_owner(struct nfs4_state_owner *sp)
-{
- struct rb_node *rb_node = &sp->so_server_node;
-
- if (!RB_EMPTY_NODE(rb_node)) {
- struct nfs_server *server = sp->so_server;
- struct nfs_client *clp = server->nfs_client;
-
- spin_lock(&clp->cl_lock);
- if (!RB_EMPTY_NODE(rb_node)) {
- rb_erase(rb_node, &server->state_owners);
- RB_CLEAR_NODE(rb_node);
- }
- spin_unlock(&clp->cl_lock);
- }
+nfs4_reset_state_owner(struct nfs4_state_owner *sp)
+{
+ /* This state_owner is no longer usable, but must
+ * remain in place so that state recovery can find it
+ * and the opens associated with it.
+ * It may also be used for new 'open' request to
+ * return a delegation to the server.
+ * So update the 'create_time' so that it looks like
+ * a new state_owner. This will cause the server to
+ * request an OPEN_CONFIRM to start a new sequence.
+ */
+ sp->so_seqid.create_time = ktime_get();
}
static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
@@ -797,19 +794,33 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
/*
* Search the state->lock_states for an existing lock_owner
- * that is compatible with current->files
+ * that is compatible with either of the given owners.
+ * If the second is non-zero, then the first refers to a Posix-lock
+ * owner (current->files) and the second refers to a flock/OFD
+ * owner (struct file*). In that case, prefer a match for the first
+ * owner.
+ * If both sorts of locks are held on the one file we cannot know
+ * which stateid was intended to be used, so a "correct" choice cannot
+ * be made. Failing that, a "consistent" choice is preferable. The
+ * consistent choice we make is to prefer the first owner, that of a
+ * Posix lock.
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state,
+ fl_owner_t fl_owner, fl_owner_t fl_owner2)
{
- struct nfs4_lock_state *pos;
+ struct nfs4_lock_state *pos, *ret = NULL;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (pos->ls_owner != fl_owner)
- continue;
- atomic_inc(&pos->ls_count);
- return pos;
+ if (pos->ls_owner == fl_owner) {
+ ret = pos;
+ break;
+ }
+ if (pos->ls_owner == fl_owner2)
+ ret = pos;
}
- return NULL;
+ if (ret)
+ atomic_inc(&ret->ls_count);
+ return ret;
}
/*
@@ -857,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
for(;;) {
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, owner);
+ lsp = __nfs4_find_lock_state(state, owner, 0);
if (lsp != NULL)
break;
if (new != NULL) {
@@ -939,22 +950,23 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
struct nfs4_state *state,
- const struct nfs_lockowner *lockowner)
+ const struct nfs_lock_context *l_ctx)
{
struct nfs4_lock_state *lsp;
- fl_owner_t fl_owner;
+ fl_owner_t fl_owner, fl_flock_owner;
int ret = -ENOENT;
-
- if (lockowner == NULL)
+ if (l_ctx == NULL)
goto out;
if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
goto out;
- fl_owner = lockowner->l_owner;
+ fl_owner = l_ctx->lockowner;
+ fl_flock_owner = l_ctx->open_context->flock_owner;
+
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner);
+ lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -986,7 +998,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
* requests.
*/
int nfs4_select_rw_stateid(struct nfs4_state *state,
- fmode_t fmode, const struct nfs_lockowner *lockowner,
+ fmode_t fmode, const struct nfs_lock_context *l_ctx,
nfs4_stateid *dst, struct rpc_cred **cred)
{
int ret;
@@ -995,7 +1007,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state,
return -EIO;
if (cred != NULL)
*cred = NULL;
- ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ ret = nfs4_copy_lock_stateid(dst, state, l_ctx);
if (ret == -EIO)
/* A lost lock - don't even consider delegations */
goto out;
@@ -1079,6 +1091,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
case -NFS4ERR_BADXDR:
case -NFS4ERR_RESOURCE:
case -NFS4ERR_NOFILEHANDLE:
+ case -NFS4ERR_MOVED:
/* Non-seqid mutating errors */
return;
};
@@ -1098,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
if (status == -NFS4ERR_BAD_SEQID)
- nfs4_drop_state_owner(sp);
+ nfs4_reset_state_owner(sp);
if (!nfs4_has_session(sp->so_server->nfs_client))
nfs_increment_seqid(status, seqid);
}
@@ -1717,7 +1730,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
break;
case -NFS4ERR_STALE_CLIENTID:
set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
break;
case -NFS4ERR_EXPIRED:
@@ -2190,7 +2202,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
}
- nfs4_schedule_lease_recovery(clp);
+ nfs4_schedule_state_manager(clp);
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index fc89e5ed07ee..e9255cb453e6 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
+#include <linux/fs_struct.h>
#include "nfs4_fs.h"
#include "internal.h"
@@ -415,6 +416,8 @@ static int nfs4_stat_to_errno(int);
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
+#define encode_layoutreturn_maxsz 0
+#define decode_layoutreturn_maxsz 0
#endif /* CONFIG_NFS_V4_1 */
#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
@@ -499,22 +502,24 @@ static int nfs4_stat_to_errno(int);
(compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
- encode_open_downgrade_maxsz + \
- encode_getattr_maxsz)
+ encode_layoutreturn_maxsz + \
+ encode_open_downgrade_maxsz)
#define NFS4_dec_open_downgrade_sz \
(compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
- decode_open_downgrade_maxsz + \
- decode_getattr_maxsz)
+ decode_layoutreturn_maxsz + \
+ decode_open_downgrade_maxsz)
#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
encode_close_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
decode_close_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
@@ -708,10 +713,13 @@ static int nfs4_stat_to_errno(int);
#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
encode_delegreturn_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
decode_delegreturn_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
@@ -1003,7 +1011,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
const struct nfs_server *server,
- bool excl_check)
+ bool excl_check, const umode_t *umask)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1017,18 +1025,21 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
/*
* We reserve enough space to write the entire attribute buffer at once.
- * In the worst-case, this would be
- * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
- * = 40 bytes, plus any contribution from variable-length fields
- * such as owner/group.
*/
if (iap->ia_valid & ATTR_SIZE) {
bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
}
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ umask = NULL;
if (iap->ia_valid & ATTR_MODE) {
- bmval[1] |= FATTR4_WORD1_MODE;
- len += 4;
+ if (umask) {
+ bmval[2] |= FATTR4_WORD2_MODE_UMASK;
+ len += 8;
+ } else {
+ bmval[1] |= FATTR4_WORD1_MODE;
+ len += 4;
+ }
}
if (iap->ia_valid & ATTR_UID) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
@@ -1129,6 +1140,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
*p++ = cpu_to_be32(label->len);
p = xdr_encode_opaque_fixed(p, label->label, label->len);
}
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(*umask);
+ }
/* out: */
}
@@ -1183,7 +1198,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server, false);
+ encode_attrs(xdr, create->attrs, create->label, create->server, false,
+ &create->umask);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1403,11 +1419,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
+ &arg->umask);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
+ &arg->umask);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1416,7 +1434,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true,
+ &arg->umask);
}
}
@@ -1672,7 +1691,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server, false);
+ encode_attrs(xdr, arg->iap, arg->label, server, false, NULL);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2015,6 +2034,7 @@ encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args,
struct compound_hdr *hdr)
{
+ const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld;
__be32 *p;
encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
@@ -2029,10 +2049,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
spin_lock(&args->inode->i_lock);
encode_nfs4_stateid(xdr, &args->stateid);
spin_unlock(&args->inode->i_lock);
- if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
- NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
- NFS_I(args->inode)->layout, xdr, args);
- } else
+ if (args->ld_private->ops && args->ld_private->ops->encode)
+ args->ld_private->ops->encode(xdr, args, args->ld_private);
+ else if (lr_ops->encode_layoutreturn)
+ lr_ops->encode_layoutreturn(xdr, args);
+ else
encode_uint32(xdr, 0);
}
@@ -2062,6 +2083,13 @@ static void encode_free_stateid(struct xdr_stream *xdr,
encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
encode_nfs4_stateid(xdr, &args->stateid);
}
+#else
+static inline void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -2249,8 +2277,11 @@ static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
+ if (args->bitmask != NULL)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_close(xdr, args, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2327,8 +2358,9 @@ static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
encode_open_downgrade(xdr, args, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2671,7 +2703,10 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_delegreturn(xdr, args->stateid, &hdr);
encode_nops(&hdr);
}
@@ -6089,6 +6124,13 @@ static int decode_free_stateid(struct xdr_stream *xdr,
res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
return res->status;
}
+#else
+static inline
+int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ return 0;
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -6114,10 +6156,13 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
status = decode_putfh(xdr);
if (status)
goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
status = decode_open_downgrade(xdr, res);
- if (status != 0)
- goto out;
- decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6444,16 +6489,18 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
+ if (res->fattr != NULL) {
+ status = decode_getfattr(xdr, res->fattr, res->server);
+ if (status != 0)
+ goto out;
+ }
status = decode_close(xdr, res);
- if (status != 0)
- goto out;
- /*
- * Note: Server may do delete on close for this file
- * in which case the getattr call will fail with
- * an ESTALE error. Shouldn't be a problem,
- * though, since fattr->valid will remain unset.
- */
- decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6920,9 +6967,17 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
status = decode_putfh(xdr);
if (status != 0)
goto out;
- status = decode_getfattr(xdr, res->fattr, res->server);
- if (status != 0)
- goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
+ if (res->fattr) {
+ status = decode_getfattr(xdr, res->fattr, res->server);
+ if (status != 0)
+ goto out;
+ }
status = decode_delegreturn(xdr);
out:
return status;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 919efd4a1a23..2a4cdce939a0 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -504,10 +504,10 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
}
void
-objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
- struct xdr_stream *xdr,
+objlayout_encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args)
{
+ struct pnfs_layout_hdr *pnfslay = args->layout;
struct objlayout *objlay = OBJLAYOUT(pnfslay);
struct objlayout_io_res *oir, *tmp;
__be32 *start;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 2641dbad345c..fc94a5872ed4 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -175,7 +175,6 @@ extern void objlayout_encode_layoutcommit(
const struct nfs4_layoutcommit_args *);
extern void objlayout_encode_layoutreturn(
- struct pnfs_layout_hdr *,
struct xdr_stream *,
const struct nfs4_layoutreturn_args *);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 965db474f4b0..6e629b856a00 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -867,8 +867,7 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
- return l1->lockowner.l_owner == l2->lockowner.l_owner
- && l1->lockowner.l_pid == l2->lockowner.l_pid;
+ return l1->lockowner == l2->lockowner;
}
/**
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 259ef85f435a..dd042498ce7c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -54,6 +54,12 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
static LIST_HEAD(pnfs_modules_tbl);
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
+static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq);
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -299,6 +305,49 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+static void
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
+{
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (seq != 0) {
+ WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
+ lo->plh_return_seq = seq;
+ }
+}
+
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
+static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+ clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+}
+
+static void
+pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+}
+
/*
* Mark a pnfs_layout_hdr and all associated layout segments as invalid
*
@@ -315,9 +364,17 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
+ struct pnfs_layout_segment *lseg, *next;
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
+ pnfs_clear_layoutreturn_info(lo);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+ !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ pnfs_clear_layoutreturn_waitbit(lo);
+ return !list_empty(&lo->plh_segs);
}
static int
@@ -396,27 +453,42 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct inode *ino = lseg->pls_layout->plh_inode;
-
- NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ if (lseg != NULL) {
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
+ }
}
static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg)
{
- struct inode *inode = lo->plh_inode;
-
WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list);
/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
atomic_dec(&lo->plh_refcount);
- if (list_empty(&lo->plh_segs)) {
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ return;
+ if (list_empty(&lo->plh_segs) &&
+ !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
if (atomic_read(&lo->plh_outstanding) == 0)
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
- rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+static bool
+pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
+ pnfs_layout_is_valid(lo)) {
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
+ return true;
+ }
+ return false;
}
void
@@ -442,6 +514,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
+ if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
+ lseg = NULL;
spin_unlock(&inode->i_lock);
pnfs_free_lseg(lseg);
pnfs_put_layout_hdr(lo);
@@ -482,22 +556,15 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
struct pnfs_layout_hdr *lo = lseg->pls_layout;
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
return;
- pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
- pnfs_free_lseg_async(lseg);
+ if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) {
+ pnfs_get_layout_hdr(lo);
+ pnfs_free_lseg_async(lseg);
+ }
}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
-static u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
-}
-
/*
* is l2 fully contained in l1?
* start1 end1
@@ -510,33 +577,13 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
- u64 end1 = end_offset(start1, l1->length);
+ u64 end1 = pnfs_end_offset(start1, l1->length);
u64 start2 = l2->offset;
- u64 end2 = end_offset(start2, l2->length);
+ u64 end2 = pnfs_end_offset(start2, l2->length);
return (start1 <= start2) && (end1 >= end2);
}
-/*
- * is l1 and l2 intersecting?
- * start1 end1
- * [----------------------------------)
- * start2 end2
- * [----------------)
- */
-static bool
-pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
- const struct pnfs_layout_range *l2)
-{
- u64 start1 = l1->offset;
- u64 end1 = end_offset(start1, l1->length);
- u64 start2 = l2->offset;
- u64 end2 = end_offset(start2, l2->length);
-
- return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
- (end2 == NFS4_MAX_UINT64 || end2 > start1);
-}
-
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
@@ -637,6 +684,20 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return remaining;
}
+static void
+pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
+ if (pnfs_match_lseg_recall(lseg, range, seq))
+ list_move_tail(&lseg->pls_list, free_me);
+ }
+}
+
/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
@@ -701,6 +762,8 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
struct inode *inode;
list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ continue;
inode = igrab(lo->plh_inode);
if (inode == NULL)
continue;
@@ -816,14 +879,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
-static void
-pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
-{
- lo->plh_return_iomode = 0;
- lo->plh_return_seq = 0;
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -941,12 +996,31 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
}
}
-void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
{
- clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
- smp_mb__after_atomic();
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+ struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
+ !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ goto out_unlock;
+ if (stateid) {
+ u32 seq = be32_to_cpu(arg_stateid->seqid);
+
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
+ pnfs_free_returned_lsegs(lo, &freeme, range, seq);
+ pnfs_set_layout_stateid(lo, stateid, true);
+ } else
+ pnfs_mark_layout_stateid_invalid(lo, &freeme);
+out_unlock:
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
+
}
static bool
@@ -957,8 +1031,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
/* Serialise LAYOUTGET/LAYOUTRETURN */
if (atomic_read(&lo->plh_outstanding) != 0)
return false;
- if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
return false;
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
if (stateid != NULL) {
@@ -978,11 +1053,29 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
return true;
}
+static void
+pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
+ struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ enum pnfs_iomode iomode)
+{
+ struct inode *inode = lo->plh_inode;
+
+ args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
+ args->inode = inode;
+ args->range.iomode = iomode;
+ args->range.offset = 0;
+ args->range.length = NFS4_MAX_UINT64;
+ args->layout = lo;
+ nfs4_stateid_copy(&args->stateid, stateid);
+}
+
static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
struct nfs4_layoutreturn *lrp;
int status = 0;
@@ -996,15 +1089,12 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
goto out;
}
- nfs4_stateid_copy(&lrp->args.stateid, stateid);
- lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
- lrp->args.inode = ino;
- lrp->args.range.iomode = iomode;
- lrp->args.range.offset = 0;
- lrp->args.range.length = NFS4_MAX_UINT64;
- lrp->args.layout = lo;
+ pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
+ lrp->args.ld_private = &lrp->ld_private;
lrp->clp = NFS_SERVER(ino)->nfs_client;
lrp->cred = lo->plh_lc_cred;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(&lrp->args);
status = nfs4_proc_layoutreturn(lrp, sync);
out:
@@ -1067,7 +1157,7 @@ _pnfs_return_layout(struct inode *ino)
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
nfs4_stateid stateid;
- int status = 0, empty;
+ int status = 0;
bool send;
dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
@@ -1081,7 +1171,14 @@ _pnfs_return_layout(struct inode *ino)
}
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
- empty = list_empty(&lo->plh_segs);
+ /* Is there an outstanding layoutreturn ? */
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE))
+ goto out_put_layout_hdr;
+ spin_lock(&ino->i_lock);
+ }
pnfs_clear_layoutcommit(ino, &tmp_list);
pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
@@ -1095,7 +1192,7 @@ _pnfs_return_layout(struct inode *ino)
}
/* Don't send a LAYOUTRETURN if list was initially empty */
- if (empty) {
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("NFS: %s no layout segments to return\n", __func__);
goto out_put_layout_hdr;
@@ -1103,10 +1200,10 @@ _pnfs_return_layout(struct inode *ino)
send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
if (send)
status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
+ pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
out:
dprintk("<-- %s status: %d\n", __func__, status);
@@ -1141,105 +1238,125 @@ pnfs_commit_and_return_layout(struct inode *inode)
return ret;
}
-bool pnfs_roc(struct inode *ino)
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_open_context *ctx;
struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
- struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_segment *lseg, *next;
nfs4_stateid stateid;
- LIST_HEAD(tmp_list);
- bool found = false, layoutreturn = false, roc = false;
+ enum pnfs_iomode iomode = 0;
+ bool layoutreturn = false, roc = false;
+ bool skip_read = false;
+ if (!nfs_have_layout(ino))
+ return false;
+retry:
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ if (!lo || !pnfs_layout_is_valid(lo) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
goto out_noroc;
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ pnfs_get_layout_hdr(lo);
+ spin_unlock(&ino->i_lock);
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE);
+ pnfs_put_layout_hdr(lo);
+ goto retry;
+ }
/* no roc if we hold a delegation */
- if (nfs4_check_delegation(ino, FMODE_READ))
- goto out_noroc;
+ if (nfs4_check_delegation(ino, FMODE_READ)) {
+ if (nfs4_check_delegation(ino, FMODE_WRITE))
+ goto out_noroc;
+ skip_read = true;
+ }
list_for_each_entry(ctx, &nfsi->open_files, list) {
state = ctx->state;
+ if (state == NULL)
+ continue;
/* Don't return layout if there is open file state */
- if (state != NULL && state->state != 0)
+ if (state->state & FMODE_WRITE)
goto out_noroc;
+ if (state->state & FMODE_READ)
+ skip_read = true;
}
- /* always send layoutreturn if being marked so */
- if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo,
- &stateid, NULL);
- list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
+ if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
+ continue;
/* If we are sending layoutreturn, invalidate all valid lsegs */
- if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
- mark_lseg_invalid(lseg, &tmp_list);
- found = true;
- }
+ if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+ continue;
+ /*
+ * Note: mark lseg for return so pnfs_layout_remove_lseg
+ * doesn't invalidate the layout for us.
+ */
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ goto out_noroc;
+
/* ROC in two conditions:
* 1. there are ROC lsegs
* 2. we don't send layoutreturn
*/
- if (found && !layoutreturn) {
- /* lo ref dropped in pnfs_roc_release() */
- pnfs_get_layout_hdr(lo);
- roc = true;
- }
+ /* lo ref dropped in pnfs_roc_release() */
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ /* If the creds don't match, we can't compound the layoutreturn */
+ if (!layoutreturn || cred != lo->plh_lc_cred)
+ goto out_noroc;
+
+ roc = layoutreturn;
+ pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
+ res->lrs_present = 0;
+ layoutreturn = false;
out_noroc:
spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
pnfs_layoutcommit_inode(ino, true);
+ if (roc) {
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(args);
+ return true;
+ }
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
- return roc;
-}
-
-void pnfs_roc_release(struct inode *ino)
-{
- struct pnfs_layout_hdr *lo;
-
- spin_lock(&ino->i_lock);
- lo = NFS_I(ino)->layout;
- pnfs_clear_layoutreturn_waitbit(lo);
- if (atomic_dec_and_test(&lo->plh_refcount)) {
- pnfs_detach_layout_hdr(lo);
- spin_unlock(&ino->i_lock);
- pnfs_free_layout_hdr(lo);
- } else
- spin_unlock(&ino->i_lock);
-}
-
-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
-{
- struct pnfs_layout_hdr *lo;
-
- spin_lock(&ino->i_lock);
- lo = NFS_I(ino)->layout;
- if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
- lo->plh_barrier = barrier;
- spin_unlock(&ino->i_lock);
- trace_nfs4_layoutreturn_on_close(ino, 0);
+ pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ return false;
}
-void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
{
- struct nfs_inode *nfsi = NFS_I(ino);
- struct pnfs_layout_hdr *lo;
- u32 current_seqid;
-
- spin_lock(&ino->i_lock);
- lo = nfsi->layout;
- current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
+ struct pnfs_layout_hdr *lo = args->layout;
+ const nfs4_stateid *arg_stateid = NULL;
+ const nfs4_stateid *res_stateid = NULL;
+ struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
- /* Since close does not return a layout stateid for use as
- * a barrier, we choose the worst-case barrier.
- */
- *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
- spin_unlock(&ino->i_lock);
+ if (ret == 0) {
+ arg_stateid = &args->stateid;
+ if (res->lrs_present)
+ res_stateid = &res->stateid;
+ }
+ pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
+ res_stateid);
+ if (ld_private && ld_private->ops && ld_private->ops->free)
+ ld_private->ops->free(ld_private);
+ pnfs_put_layout_hdr(lo);
+ trace_nfs4_layoutreturn_on_close(args->inode, 0);
}
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
@@ -1252,13 +1369,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
* i_lock */
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
sleep = true;
+ }
spin_unlock(&ino->i_lock);
-
- if (sleep)
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
return sleep;
}
@@ -1375,6 +1490,7 @@ alloc_init_layout_hdr(struct inode *ino,
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_return_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
lo->plh_lc_cred = get_rpccred(ctx->cred);
@@ -1841,7 +1957,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ if (!pnfs_layout_is_valid(lo)) {
+ /* We have a completely new layout */
+ pnfs_set_layout_stateid(lo, &res->stateid, true);
+ } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
@@ -1851,12 +1970,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
} else {
/*
* We got an entirely new state ID. Mark all segments for the
- * inode invalid, and don't bother validating the stateid
- * sequence number.
+ * inode invalid, and retry the layoutget
*/
pnfs_mark_layout_stateid_invalid(lo, &free_me);
-
- pnfs_set_layout_stateid(lo, &res->stateid, true);
+ goto out_forget;
}
pnfs_get_lseg(lseg);
@@ -1877,20 +1994,6 @@ out_forget:
return ERR_PTR(-EAGAIN);
}
-static void
-pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
- u32 seq)
-{
- if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
- iomode = IOMODE_ANY;
- lo->plh_return_iomode = iomode;
- set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
- if (seq != 0) {
- WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
- lo->plh_return_seq = seq;
- }
-}
-
/**
* pnfs_mark_matching_lsegs_return - Free or return matching layout segments
* @lo: pointer to layout header
@@ -1945,17 +2048,18 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
- LIST_HEAD(free_me);
bool return_now = false;
spin_lock(&inode->i_lock);
pnfs_set_plh_return_info(lo, range.iomode, 0);
+ /* Block LAYOUTGET */
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) {
nfs4_stateid stateid;
enum pnfs_iomode iomode;
@@ -1967,7 +2071,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
}
- pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -2063,7 +2166,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
*
*/
if (pgio->pg_lseg) {
- seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
+ seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
pgio->pg_lseg->pls_range.length);
req_start = req_offset(req);
WARN_ON_ONCE(req_start >= seg_end);
@@ -2286,6 +2389,10 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
struct nfs_pageio_descriptor pgio;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ /* Prevent deadlocks with layoutreturn! */
+ pnfs_put_lseg(hdr->lseg);
+ hdr->lseg = NULL;
+
nfs_pageio_init_read(&pgio, hdr->inode, false,
hdr->completion_ops);
hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5c295512c967..63f77b49a586 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -96,6 +96,7 @@ enum {
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */
NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
@@ -171,8 +172,8 @@ struct pnfs_layoutdriver_type {
(struct nfs_server *server, struct pnfs_device *pdev,
gfp_t gfp_flags);
- void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
- struct xdr_stream *xdr,
+ int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
+ void (*encode_layoutreturn) (struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
@@ -181,7 +182,6 @@ struct pnfs_layoutdriver_type {
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
- void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
};
struct pnfs_layout_hdr {
@@ -190,6 +190,7 @@ struct pnfs_layout_hdr {
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
+ struct list_head plh_return_segs; /* invalid layout segments */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
@@ -270,10 +271,13 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
u32 seq);
int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list);
-bool pnfs_roc(struct inode *ino);
-void pnfs_roc_release(struct inode *ino);
-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred);
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret);
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
@@ -292,7 +296,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
enum pnfs_iomode iomode,
bool strict_iomode,
gfp_t gfp_flags);
-void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid);
void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg,
@@ -362,8 +369,7 @@ struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
void nfs4_pnfs_v3_ds_connect_unload(void);
void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
struct nfs4_deviceid_node *devid, unsigned int timeo,
- unsigned int retrans, u32 version, u32 minor_version,
- rpc_authflavor_t au_flavor);
+ unsigned int retrans, u32 version, u32 minor_version);
struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
struct xdr_stream *xdr,
gfp_t gfp_flags);
@@ -559,6 +565,38 @@ pnfs_copy_range(struct pnfs_layout_range *dst,
memcpy(dst, src, sizeof(*dst));
}
+static inline u64
+pnfs_end_offset(u64 start, u64 len)
+{
+ if (NFS4_MAX_UINT64 - start <= len)
+ return NFS4_MAX_UINT64;
+ return start + len;
+}
+
+/*
+ * Are 2 ranges intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline bool
+pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2)
+{
+ return (end1 == NFS4_MAX_UINT64 || start2 < end1) &&
+ (end2 == NFS4_MAX_UINT64 || start1 < end2);
+}
+
+static inline bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1 = pnfs_end_offset(l1->offset, l1->length);
+ u64 end2 = pnfs_end_offset(l2->offset, l2->length);
+
+ return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2);
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
@@ -630,23 +668,18 @@ pnfs_layoutcommit_outstanding(struct inode *inode)
static inline bool
-pnfs_roc(struct inode *ino)
+pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred)
{
return false;
}
static inline void
-pnfs_roc_release(struct inode *ino)
-{
-}
-
-static inline void
-pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
-{
-}
-
-static inline void
-pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
+pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
{
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 53b4705abcc7..9414b492439f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -600,8 +600,7 @@ static struct nfs_client *(*get_v3_ds_connect)(
int ds_addrlen,
int ds_proto,
unsigned int ds_timeo,
- unsigned int ds_retrans,
- rpc_authflavor_t au_flavor);
+ unsigned int ds_retrans);
static bool load_v3_ds_connect(void)
{
@@ -625,15 +624,13 @@ EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
unsigned int timeo,
- unsigned int retrans,
- rpc_authflavor_t au_flavor)
+ unsigned int retrans)
{
struct nfs_client *clp = ERR_PTR(-EIO);
struct nfs4_pnfs_ds_addr *da;
int status = 0;
- dprintk("--> %s DS %s au_flavor %d\n", __func__,
- ds->ds_remotestr, au_flavor);
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
if (!load_v3_ds_connect())
goto out;
@@ -657,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
- timeo, retrans, au_flavor);
+ timeo, retrans);
}
if (IS_ERR(clp)) {
@@ -676,15 +673,13 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
unsigned int timeo,
unsigned int retrans,
- u32 minor_version,
- rpc_authflavor_t au_flavor)
+ u32 minor_version)
{
struct nfs_client *clp = ERR_PTR(-EIO);
struct nfs4_pnfs_ds_addr *da;
int status = 0;
- dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
- au_flavor);
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
list_for_each_entry(da, &ds->ds_addrs, da_node) {
dprintk("%s: DS %s: trying address %s\n",
@@ -720,8 +715,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
- timeo, retrans, minor_version,
- au_flavor);
+ timeo, retrans, minor_version);
if (IS_ERR(clp))
continue;
@@ -755,19 +749,17 @@ out:
*/
void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
struct nfs4_deviceid_node *devid, unsigned int timeo,
- unsigned int retrans, u32 version,
- u32 minor_version, rpc_authflavor_t au_flavor)
+ unsigned int retrans, u32 version, u32 minor_version)
{
if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
int err = 0;
if (version == 3) {
err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
- retrans, au_flavor);
+ retrans);
} else if (version == 4) {
err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
- retrans, minor_version,
- au_flavor);
+ retrans, minor_version);
} else {
dprintk("%s: unsupported DS version %d\n", __func__,
version);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 001796bcd6c8..6bca17883b93 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -55,7 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/rcupdate.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "nfs4_fs.h"
#include "callback.h"
@@ -2904,7 +2904,7 @@ module_param(max_session_slots, ushort, 0644);
MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
"requests the client will negotiate");
module_param(max_session_cb_slots, ushort, 0644);
-MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 "
+MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 "
"callbacks the client will process for a given server");
module_param(send_implementation_id, ushort, 0644);
MODULE_PARM_DESC(send_implementation_id,
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 4fe3eead3868..5a1d0ded8979 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -77,7 +77,6 @@ static const char *nfs_get_link(struct dentry *dentry,
* symlinks can't do much...
*/
const struct inode_operations nfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = nfs_get_link,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53211838f72a..b00d53d13d47 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -24,7 +24,7 @@
#include <linux/freezer.h>
#include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "delegation.h"
#include "internal.h"
@@ -1151,8 +1151,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (l_ctx && flctx &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock))) {
- do_flush |= l_ctx->lockowner.l_owner != current->files
- || l_ctx->lockowner.l_pid != current->tgid;
+ do_flush |= l_ctx->lockowner != current->files;
}
nfs_release_request(req);
if (!do_flush)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index fd8c9a5bcac4..420d3a0ab258 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -9,7 +9,7 @@
#include <net/netns/generic.h>
#include <linux/fs.h>
-static int grace_net_id;
+static unsigned int grace_net_id;
static DEFINE_SPINLOCK(grace_lock);
/**
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index c16bf5af6831..34c1c449fddf 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -10,7 +10,7 @@
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/sunrpc/addr.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "state.h"
#include "netns.h"
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ee36efd5aece..3714231a9d0f 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -124,5 +124,5 @@ struct nfsd_net {
/* Simple check to find out if a given net was properly initialized */
#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
-extern int nfsd_net_id;
+extern unsigned int nfsd_net_id;
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 211dc2aed8e1..eb78109d666c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1061,7 +1061,7 @@ static const struct rpc_call_ops nfsd4_cb_ops = {
int nfsd4_create_callback_queue(void)
{
- callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+ callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0);
if (!callback_wq)
return -ENOMEM;
return 0;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 42aace4fc4c8..1fc07a9c70e9 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -223,10 +223,11 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_layout_stateid *ls;
struct nfs4_stid *stp;
- stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+ stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache,
+ nfsd4_free_layout_stateid);
if (!stp)
return NULL;
- stp->sc_free = nfsd4_free_layout_stateid;
+
get_nfs4_file(fp);
stp->sc_file = fp;
@@ -686,10 +687,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
return 0;
}
/* Fallthrough */
- case -NFS4ERR_NOMATCHING_LAYOUT:
- trace_layout_recall_done(&ls->ls_stid.sc_stateid);
- task->tk_status = 0;
- return 1;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
@@ -702,6 +699,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
else
nfsd4_cb_layout_fail(ls);
return -1;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+ task->tk_status = 0;
+ return 1;
}
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index abb09b580389..74a6e573e061 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -96,33 +96,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct dentry *dentry = cstate->current_fh.fh_dentry;
- /*
- * Check about attributes are supported by the NFSv4 server or not.
- * According to spec, unsupported attributes return ERR_ATTRNOTSUPP.
- */
- if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) ||
- (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) ||
- (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
+ if (!nfsd_attrs_supported(cstate->minorversion, bmval))
return nfserr_attrnotsupp;
-
- /*
- * Check FATTR4_WORD0_ACL can be supported
- * in current environment or not.
- */
- if (bmval[0] & FATTR4_WORD0_ACL) {
- if (!IS_POSIXACL(d_inode(dentry)))
- return nfserr_attrnotsupp;
- }
-
- /*
- * According to spec, read-only attributes return ERR_INVAL.
- */
- if (writable) {
- if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
- (bmval[2] & ~writable[2]))
- return nfserr_inval;
- }
-
+ if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry)))
+ return nfserr_attrnotsupp;
+ if (writable && !bmval_is_subset(bmval, writable))
+ return nfserr_inval;
+ if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) &&
+ (bmval[1] & FATTR4_WORD1_MODE))
+ return nfserr_inval;
return nfs_ok;
}
@@ -695,9 +677,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
return nfserr_inval;
- getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
- getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
- getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
+ getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0];
+ getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1];
+ getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2];
getattr->ga_fhp = &cstate->current_fh;
return nfs_ok;
@@ -799,9 +781,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
return nfserr_inval;
- readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
- readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
- readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
+ readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0];
+ readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1];
+ readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2];
if ((cookie == 1) || (cookie == 2) ||
(cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4b4beaaa4eaa..a0dee8ae9f97 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -633,8 +633,8 @@ out:
return co;
}
-struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
- struct kmem_cache *slab)
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+ void (*sc_free)(struct nfs4_stid *))
{
struct nfs4_stid *stid;
int new_id;
@@ -650,6 +650,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
idr_preload_end();
if (new_id < 0)
goto out_free;
+
+ stid->sc_free = sc_free;
stid->sc_client = cl;
stid->sc_stateid.si_opaque.so_id = new_id;
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
@@ -675,15 +677,12 @@ out_free:
static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
{
struct nfs4_stid *stid;
- struct nfs4_ol_stateid *stp;
- stid = nfs4_alloc_stid(clp, stateid_slab);
+ stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid);
if (!stid)
return NULL;
- stp = openlockstateid(stid);
- stp->st_stid.sc_free = nfs4_free_ol_stateid;
- return stp;
+ return openlockstateid(stid);
}
static void nfs4_free_deleg(struct nfs4_stid *stid)
@@ -781,11 +780,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
goto out_dec;
if (delegation_blocked(&current_fh->fh_handle))
goto out_dec;
- dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
+ dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
if (dp == NULL)
goto out_dec;
- dp->dl_stid.sc_free = nfs4_free_deleg;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -5580,7 +5578,6 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
get_nfs4_file(fp);
stp->st_stid.sc_file = fp;
- stp->st_stid.sc_free = nfs4_free_lock_stateid;
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
@@ -5623,7 +5620,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
lst = find_lock_stateid(lo, fi);
if (lst == NULL) {
spin_unlock(&clp->cl_lock);
- ns = nfs4_alloc_stid(clp, stateid_slab);
+ ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid);
if (ns == NULL)
return NULL;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c2d2895a1ec1..8fae53ce21d1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -33,6 +33,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/fs_struct.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
@@ -57,6 +58,20 @@
#define NFSDDBG_FACILITY NFSDDBG_XDR
+u32 nfsd_suppattrs[3][3] = {
+ {NFSD4_SUPPORTED_ATTRS_WORD0,
+ NFSD4_SUPPORTED_ATTRS_WORD1,
+ NFSD4_SUPPORTED_ATTRS_WORD2},
+
+ {NFSD4_1_SUPPORTED_ATTRS_WORD0,
+ NFSD4_1_SUPPORTED_ATTRS_WORD1,
+ NFSD4_1_SUPPORTED_ATTRS_WORD2},
+
+ {NFSD4_1_SUPPORTED_ATTRS_WORD0,
+ NFSD4_1_SUPPORTED_ATTRS_WORD1,
+ NFSD4_2_SUPPORTED_ATTRS_WORD2},
+};
+
/*
* As per referral draft, the fsid for a referral MUST be different from the fsid of the containing
* directory in order to indicate to the client that a filesystem boundary is present
@@ -285,7 +300,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
static __be32
nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
struct iattr *iattr, struct nfs4_acl **acl,
- struct xdr_netobj *label)
+ struct xdr_netobj *label, int *umask)
{
int expected_len, len = 0;
u32 dummy32;
@@ -296,6 +311,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if ((status = nfsd4_decode_bitmap(argp, bmval)))
return status;
+ if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
+ || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
+ || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) {
+ if (nfsd_attrs_supported(argp->minorversion, bmval))
+ return nfserr_inval;
+ return nfserr_attrnotsupp;
+ }
+
READ_BUF(4);
expected_len = be32_to_cpup(p++);
@@ -435,12 +458,18 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
return nfserr_jukebox;
}
#endif
-
- if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
- || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
- || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
- READ_BUF(expected_len - len);
- else if (len != expected_len)
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ if (!umask)
+ goto xdr_error;
+ READ_BUF(8);
+ len += 8;
+ dummy32 = be32_to_cpup(p++);
+ iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO);
+ dummy32 = be32_to_cpup(p++);
+ *umask = dummy32 & S_IRWXUGO;
+ iattr->ia_valid |= ATTR_MODE;
+ }
+ if (len != expected_len)
goto xdr_error;
DECODE_TAIL;
@@ -634,7 +663,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
- &create->cr_acl, &create->cr_label);
+ &create->cr_acl, &create->cr_label,
+ &current->fs->umask);
if (status)
goto out;
@@ -879,13 +909,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
case NFS4_OPEN_NOCREATE:
break;
case NFS4_OPEN_CREATE:
+ current->fs->umask = 0;
READ_BUF(4);
open->op_createmode = be32_to_cpup(p++);
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl, &open->op_label);
+ &open->op_iattr, &open->op_acl, &open->op_label,
+ &current->fs->umask);
if (status)
goto out;
break;
@@ -899,7 +931,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl, &open->op_label);
+ &open->op_iattr, &open->op_acl, &open->op_label,
+ &current->fs->umask);
if (status)
goto out;
break;
@@ -1136,7 +1169,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
if (status)
return status;
return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
- &setattr->sa_acl, &setattr->sa_label);
+ &setattr->sa_acl, &setattr->sa_label, NULL);
}
static __be32
@@ -2340,9 +2373,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
- BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
- BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
- BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
+ BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
if (exp->ex_fslocs.migrated) {
status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
@@ -2409,29 +2440,29 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- u32 word0 = nfsd_suppattrs0(minorversion);
- u32 word1 = nfsd_suppattrs1(minorversion);
- u32 word2 = nfsd_suppattrs2(minorversion);
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
if (!IS_POSIXACL(dentry->d_inode))
- word0 &= ~FATTR4_WORD0_ACL;
+ supp[0] &= ~FATTR4_WORD0_ACL;
if (!contextsupport)
- word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (!word2) {
+ supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (!supp[2]) {
p = xdr_reserve_space(xdr, 12);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(word0);
- *p++ = cpu_to_be32(word1);
+ *p++ = cpu_to_be32(supp[0]);
+ *p++ = cpu_to_be32(supp[1]);
} else {
p = xdr_reserve_space(xdr, 16);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(word0);
- *p++ = cpu_to_be32(word1);
- *p++ = cpu_to_be32(word2);
+ *p++ = cpu_to_be32(supp[0]);
+ *p++ = cpu_to_be32(supp[1]);
+ *p++ = cpu_to_be32(supp[2]);
}
}
if (bmval0 & FATTR4_WORD0_TYPE) {
@@ -3576,10 +3607,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
if (!p)
return nfserr_resource;
/*
- * XXX: By default, the ->readlink() VFS op will truncate symlinks
- * if they would overflow the buffer. Is this kosher in NFSv4? If
- * not, one easy fix is: if ->readlink() precisely fills the buffer,
- * assume that truncation occurred, and return NFS4ERR_RESOURCE.
+ * XXX: By default, vfs_readlink() will truncate symlinks if they
+ * would overflow the buffer. Is this kosher in NFSv4? If not, one
+ * easy fix is: if vfs_readlink() precisely fills the buffer, assume
+ * that truncation occurred, and return NFS4ERR_RESOURCE.
*/
nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp,
(char *)p, &maxcount);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 54cde9a5864e..d6b97b424ad1 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -9,6 +9,7 @@
*/
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/sunrpc/addr.h>
#include <linux/highmem.h>
#include <linux/log2.h>
@@ -174,8 +175,12 @@ int nfsd_reply_cache_init(void)
goto out_nomem;
drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
- if (!drc_hashtbl)
- goto out_nomem;
+ if (!drc_hashtbl) {
+ drc_hashtbl = vzalloc(hashsize * sizeof(*drc_hashtbl));
+ if (!drc_hashtbl)
+ goto out_nomem;
+ }
+
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
spin_lock_init(&drc_hashtbl[i].cache_lock);
@@ -204,7 +209,7 @@ void nfsd_reply_cache_shutdown(void)
}
}
- kfree (drc_hashtbl);
+ kvfree(drc_hashtbl);
drc_hashtbl = NULL;
drc_hashsize = 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 36b2af931e06..f3b2f34b10a3 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -217,7 +217,7 @@ static const struct file_operations pool_stats_operations = {
.release = nfsd_pool_stats_release,
};
-static struct file_operations reply_cache_stats_operations = {
+static const struct file_operations reply_cache_stats_operations = {
.open = nfsd_reply_cache_stats_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void)
}
#endif
-int nfsd_net_id;
+unsigned int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 9446849888d5..d74c8c44dc35 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -359,44 +359,46 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+ FATTR4_WORD2_MODE_UMASK | \
NFSD4_2_SECURITY_ATTRS)
-static inline u32 nfsd_suppattrs0(u32 minorversion)
-{
- return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
- : NFSD4_SUPPORTED_ATTRS_WORD0;
-}
+extern u32 nfsd_suppattrs[3][3];
-static inline u32 nfsd_suppattrs1(u32 minorversion)
+static inline bool bmval_is_subset(u32 *bm1, u32 *bm2)
{
- return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1
- : NFSD4_SUPPORTED_ATTRS_WORD1;
+ return !((bm1[0] & ~bm2[0]) ||
+ (bm1[1] & ~bm2[1]) ||
+ (bm1[2] & ~bm2[2]));
}
-static inline u32 nfsd_suppattrs2(u32 minorversion)
+static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval)
{
- switch (minorversion) {
- default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
- case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
- case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
- }
+ return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]);
}
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
#define NFSD_WRITEONLY_ATTRS_WORD1 \
(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
-/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+/*
+ * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add
+ * a writeable attribute here without also adding code to parse it to
+ * nfsd4_decode_fattr().
+ */
#define NFSD_WRITEABLE_ATTRS_WORD0 \
(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
+ FATTR4_WORD2_SECURITY_LABEL
#else
-#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0
#endif
+#define NFSD_WRITEABLE_ATTRS_WORD2 \
+ (FATTR4_WORD2_MODE_UMASK \
+ | MAYBE_FATTR4_WORD2_SECURITY_LABEL)
#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
NFSD_WRITEABLE_ATTRS_WORD0
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a2b65fc56dd6..e6bfd96734c0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -661,8 +661,8 @@ nfsd(void *vrqstp)
mutex_lock(&nfsd_mutex);
/* At this point, the thread shares current->fs
- * with the init process. We need to create files with a
- * umask of 0 instead of init's umask. */
+ * with the init process. We need to create files with the
+ * umask as defined by the client instead of init's umask. */
if (unshare_fs_struct() < 0) {
printk("Unable to start nfsd thread: out of memory\n");
goto out;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index c9399366f9df..4516e8b7d776 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -603,8 +603,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
-struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
- struct kmem_cache *slab);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+ void (*sc_free)(struct nfs4_stid *));
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8ca642fe9b21..26c6fdb4bf67 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -26,7 +26,7 @@
#include <linux/jhash.h>
#include <linux/ima.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
@@ -509,8 +509,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
- return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
- count));
+ return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count));
}
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
@@ -1451,7 +1450,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
- struct inode *inode;
mm_segment_t oldfs;
__be32 err;
int host_err;
@@ -1463,10 +1461,9 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
- inode = d_inode(path.dentry);
err = nfserr_inval;
- if (!inode->i_op->readlink)
+ if (!d_is_symlink(path.dentry))
goto out;
touch_atime(&path);
@@ -1475,7 +1472,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
*/
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp);
+ host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp);
set_fs(oldfs);
if (host_err < 0)
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 2b71c60fe982..515d13c196da 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -568,7 +568,6 @@ const struct inode_operations nilfs_special_inode_operations = {
};
const struct inode_operations nilfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.permission = nilfs_permission,
};
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c95d369e90aa..12eeae62a2b1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
set_buffer_dirty(nilfs->ns_sbh[0]);
if (nilfs_test_opt(nilfs, BARRIER)) {
err = __sync_dirty_buffer(nilfs->ns_sbh[0],
- WRITE_SYNC | WRITE_FLUSH_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
} else {
err = sync_dirty_buffer(nilfs->ns_sbh[0]);
}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 6faaf710e563..5a4ec309e283 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
struct dnotify_mark *dn_mark;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e0e5f7c3c99f..bbc175d4213d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group,
static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmnt_mark,
u32 event_mask,
- void *data, int data_type)
+ const void *data, int data_type)
{
__u32 marks_mask, marks_ignored_mask;
- struct path *path = data;
+ const struct path *path = data;
pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
" data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
}
struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
- struct path *path)
+ const struct path *path)
{
struct fanotify_event_info *event;
@@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *fanotify_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
int ret = 0;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 2a5fb14115df..4500a74f8d38 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
}
struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
- struct path *path);
+ const struct path *path);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index db39de2dd4cb..b41515d3f081 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -86,7 +86,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
}
/* Notify this dentry's parent about a child's events. */
-int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
+int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask)
{
struct dentry *parent;
struct inode *p_inode;
@@ -125,7 +125,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent);
static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data,
+ __u32 mask, const void *data,
int data_is, u32 cookie,
const unsigned char *file_name)
{
@@ -187,7 +187,7 @@ static int send_to_group(struct inode *to_tell,
* out to all of the registered fsnotify_group. Those groups can then use the
* notification event in whatever means they feel necessary.
*/
-int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
@@ -199,7 +199,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
if (data_is == FSNOTIFY_EVENT_PATH)
- mnt = real_mount(((struct path *)data)->mnt);
+ mnt = real_mount(((const struct path *)data)->mnt);
else
mnt = NULL;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 741077deef3b..a3645249f7ec 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -150,12 +150,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
*/
void fsnotify_unmount_inodes(struct super_block *sb)
{
- struct inode *inode, *next_i, *need_iput = NULL;
+ struct inode *inode, *iput_inode = NULL;
spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
- struct inode *need_iput_tmp;
-
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
/*
* We cannot __iget() an inode in state I_FREEING,
* I_WILL_FREE, or I_NEW which is fine because by that point
@@ -178,49 +176,24 @@ void fsnotify_unmount_inodes(struct super_block *sb)
continue;
}
- need_iput_tmp = need_iput;
- need_iput = NULL;
-
- /* In case fsnotify_inode_delete() drops a reference. */
- if (inode != need_iput_tmp)
- __iget(inode);
- else
- need_iput_tmp = NULL;
+ __iget(inode);
spin_unlock(&inode->i_lock);
-
- /* In case the dropping of a reference would nuke next_i. */
- while (&next_i->i_sb_list != &sb->s_inodes) {
- spin_lock(&next_i->i_lock);
- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
- atomic_read(&next_i->i_count)) {
- __iget(next_i);
- need_iput = next_i;
- spin_unlock(&next_i->i_lock);
- break;
- }
- spin_unlock(&next_i->i_lock);
- next_i = list_next_entry(next_i, i_sb_list);
- }
-
- /*
- * We can safely drop s_inode_list_lock here because either
- * we actually hold references on both inode and next_i or
- * end of list. Also no new inodes will be added since the
- * umount has begun.
- */
spin_unlock(&sb->s_inode_list_lock);
- if (need_iput_tmp)
- iput(need_iput_tmp);
+ if (iput_inode)
+ iput(iput_inode);
/* for each watch, send FS_UNMOUNT and then remove it */
fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
fsnotify_inode_delete(inode);
- iput(inode);
+ iput_inode = inode;
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
+
+ if (iput_inode)
+ iput(iput_inode);
}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ed855ef6f077..a6f5907a3fee 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie);
extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 2cd900c2c737..19e7ec109a75 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
+ u32 mask, const void *data, int data_type,
const unsigned char *file_name, u32 cookie)
{
struct inotify_inode_mark *i_mark;
@@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group,
if ((inode_mark->mask & FS_EXCL_UNLINK) &&
(data_type == FSNOTIFY_EVENT_PATH)) {
- struct path *path = data;
+ const struct path *path = data;
if (d_unlinked(path->dentry))
return 0;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d3fea0bd89e2..6043306e8e21 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
}
}
-void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
-{
- assert_spin_locked(&old->lock);
- new->inode = old->inode;
- new->mnt = old->mnt;
- if (old->group)
- fsnotify_get_group(old->group);
- new->group = old->group;
- new->mask = old->mask;
- new->free_mark = old->free_mark;
-}
-
/*
* Nothing fancy, just initialize lists and locks and counters.
*/
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8718af895eab..8c9fb29c6673 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -118,7 +118,7 @@ again:
return ret;
}
-static int open_related_ns(struct ns_common *ns,
+int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fe251f187ff8..cc91856b5e2d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -29,6 +29,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
+#include <linux/bio.h>
#include "aops.h"
#include "attrib.h"
@@ -764,7 +765,7 @@ lock_retry_remap:
}
// TODO: Instantiate the hole.
// clear_buffer_new(bh);
- // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ // clean_bdev_bh_alias(bh);
ntfs_error(vol->sb, "Writing into sparse regions is "
"not supported yet. Sorry.");
err = -EOPNOTSUPP;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index bf72a2c58b75..358ed7e1195a 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -30,7 +30,7 @@
#include <linux/writeback.h>
#include <asm/page.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "attrib.h"
#include "bitmap.h"
@@ -740,8 +740,7 @@ map_buffer_cached:
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (bh_end <= pos || bh_pos >= end)
mark_buffer_dirty(bh);
else
@@ -784,7 +783,7 @@ map_buffer_cached:
continue;
}
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* If the buffer is fully outside the write, zero it,
* set it uptodate, and mark it dirty so it gets
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 761f12f7f3ef..353379ff6057 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -27,6 +27,7 @@
#include <linux/buffer_head.h>
#include <linux/bitops.h>
#include <linux/log2.h>
+#include <linux/bio.h>
#include "attrib.h"
#include "aops.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index d3c009626032..b6f402194f02 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -23,6 +23,7 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/bio.h>
#include "attrib.h"
#include "aops.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f72712f6c28d..d4ec0d8961a6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
rec = &el->l_recs[index];
if (new_flags && (rec->e_flags & new_flags)) {
mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
- "extent that already had them",
+ "extent that already had them\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
new_flags);
goto out;
@@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
if (clear_flags && !(rec->e_flags & clear_flags)) {
mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
- "extent that didn't have them",
+ "extent that didn't have them\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
clear_flags);
goto out;
@@ -5713,8 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_refcount_tree *ref_tree = NULL;
if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
if (!refcount_tree_locked) {
ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c5c5b9748ea3..11556b7d93ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)block);
+ /*
+ * The swap code (ab-)uses ->bmap to get a block mapping and then
+ * bypasseѕ the file system for actual I/O. We really can't allow
+ * that on refcounted inodes, so we have to skip out here. And yes,
+ * 0 is the magic code for a bmap error..
+ */
+ if (ocfs2_is_refcount_inode(inode))
+ return 0;
+
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
*/
@@ -630,7 +639,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
if (PageUptodate(page)) {
@@ -1950,8 +1959,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ loff_t pos, unsigned len, unsigned copied, void *fsdata)
{
int i, ret;
unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@ -2064,7 +2072,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
int ret;
struct inode *inode = mapping->host;
- ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+ ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 1);
@@ -2241,7 +2249,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
dwc->dw_zero_count++;
}
- ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
BUG_ON(ret != len);
ret = 0;
unlock:
@@ -2254,10 +2262,10 @@ out:
return ret;
}
-static void ocfs2_dio_end_io_write(struct inode *inode,
- struct ocfs2_dio_write_ctxt *dwc,
- loff_t offset,
- ssize_t bytes)
+static int ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
{
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_extent_tree et;
@@ -2308,7 +2316,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode,
mlog_errno(ret);
}
- di = (struct ocfs2_dinode *)di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
@@ -2365,6 +2373,8 @@ out:
if (locked)
inode_unlock(inode);
ocfs2_dio_free_write_ctx(inode, dwc);
+
+ return ret;
}
/*
@@ -2379,21 +2389,19 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
-
- if (bytes <= 0)
- return 0;
+ int ret = 0;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (private)
- ocfs2_dio_end_io_write(inode, private, offset, bytes);
+ if (bytes > 0 && private)
+ ret = ocfs2_dio_end_io_write(inode, private, offset, bytes);
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
- return 0;
+ return ret;
}
static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index b1c9f28a57b1..8614ff069d99 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle,
struct buffer_head *bh));
int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
+ loff_t pos, unsigned len, unsigned copied, void *fsdata);
typedef enum {
OCFS2_WRITE_BUFFER = 0,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 8f040f88ade4..d9ebe11c8990 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
+#include <linux/bio.h>
#include <cluster/masklog.h>
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 636abcbd4650..f6e871760f8d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
slot = o2nm_this_node();
bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
memset(hb_block, 0, reg->hr_block_bytes);
/* TODO: time stuff */
- cputime = CURRENT_TIME.tv_sec;
+ cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;
@@ -1250,7 +1250,7 @@ static int o2hb_thread(void *data)
mlog(ML_HEARTBEAT,
"start = %lld, end = %lld, msec = %u, ret = %d\n",
- before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
+ before_hb, after_hb, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index dfe162f5fd4c..d331c2386b94 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -24,7 +24,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "masklog.h"
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 8abab16b4602..d4b5c81f0445 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -62,7 +62,7 @@
#include <linux/export.h>
#include <net/tcp.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "heartbeat.h"
#include "tcp.h"
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3f828a187049..a464c8088170 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1609,8 +1609,6 @@ way_up_top:
__dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "mle was found\n");
- set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@@ -1625,8 +1623,7 @@ way_up_top:
response = DLM_MASTER_RESP_NO;
} else
response = DLM_MASTER_RESP_MAYBE;
- if (set_maybe)
- set_bit(request->node_idx, tmpmle->maybe_map);
+ set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
}
spin_unlock(&dlm->master_lock);
@@ -1644,12 +1641,6 @@ send_response:
* dlm_assert_master_worker() isn't called, we drop it here.
*/
if (dispatch_assert) {
- if (response != DLM_MASTER_RESP_YES)
- mlog(ML_ERROR, "invalid response %d\n", response);
- if (!res) {
- mlog(ML_ERROR, "bad lockres while trying to assert!\n");
- BUG();
- }
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
spin_lock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index dd5cb8bcefd1..74407c6dd592 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->spinlock);
dlm_kick_recovery_thread(dlm);
break;
- default:
- BUG();
}
mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1079fae5aa12..9ab9e1892b5f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -45,7 +45,7 @@
#include <linux/backing-dev.h>
#include <linux/poll.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "stackglue.h"
#include "userdlm.h"
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 83d576f6a287..77d1632e905d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
+ /*
+ * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
+ * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
+ * we can recover correctly from node failure. Otherwise, we may get
+ * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+ */
+ if (!ocfs2_is_o2cb_active() &&
+ lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+ lvb = 1;
+
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 000c234d7bbd..c4889655d32b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1030,7 +1030,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
* Only quota files call this without a bh, and they can't be
* refcounted.
*/
- BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
*done = ret;
}
-static int ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh, u64 byte_start,
- u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
{
int ret = 0, flags = 0, done = 0, i;
u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -1719,8 +1719,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
* within one cluster(means is not exactly aligned to clustersize).
*/
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
-
+ if (ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
if (ret) {
mlog_errno(ret);
@@ -2036,7 +2035,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
struct super_block *sb = inode->i_sb;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
- !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ !ocfs2_is_refcount_inode(inode) ||
OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
@@ -2440,6 +2439,31 @@ out:
return offset;
}
+static int ocfs2_file_clone_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
+
+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2479,6 +2503,8 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops = {
@@ -2524,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f22215c..897fd9a2e51d 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c56a7679df93..382401d3e88f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_commit;
}
- di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+ di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5af68fcdf9d3..9b955f732bca 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -181,4 +181,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_
return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
}
+/* Does this inode have the reflink flag set? */
+static inline bool ocfs2_is_refcount_inode(struct inode *inode)
+{
+ return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+}
+
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a244f14c6b87..d5e5fa7f0743 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
*/
seqno++;
os->os_count++;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
@@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
struct ocfs2_orphan_scan *os;
os = &osb->osb_orphan_scan;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 71545ad4628c..429088786e93 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
ret = VM_FAULT_NOPAGE;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
BUG_ON(ret != len);
ret = VM_FAULT_LOCKED;
out:
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 4e8f32eb0bdb..e52a2852d50d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -235,10 +235,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
-
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
-
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
@@ -581,10 +578,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
-
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
-
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
BUG_ON(!context->refcount_loc);
ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8d887c75765c..3b0a10d9b36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct ocfs2_extent_list *fel;
u16 feat;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct timespec64 ts;
*new_fe_bh = NULL;
@@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+ ktime_get_real_ts64(&ts);
fe->i_atime = fe->i_ctime = fe->i_mtime =
- cpu_to_le64(CURRENT_TIME.tv_sec);
+ cpu_to_le64(ts.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
- cpu_to_le32(CURRENT_TIME.tv_nsec);
+ cpu_to_le32(ts.tv_nsec);
fe->i_dtime = 0;
/*
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index e63af7ddfe68..7e5958b0be6b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,7 +224,7 @@ struct ocfs2_orphan_scan {
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
- struct timespec os_scantime; /* time this node ran the scan */
+ time64_t os_scantime; /* time this node ran the scan */
u32 os_count; /* tracks node specific scans */
u32 os_seqno; /* tracks cluster wide scans */
atomic_t os_state; /* ACTIVE or INACTIVE */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 87e577a49b0d..cec495a921e3 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -634,7 +634,15 @@ static void qsync_work_fn(struct work_struct *work)
dqi_sync_work.work);
struct super_block *sb = oinfo->dqi_gqinode->i_sb;
- dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ /*
+ * We have to be careful here not to deadlock on s_umount as umount
+ * disabling quotas may be in progress and it waits for this work to
+ * complete. If trylock fails, we'll do the sync next time...
+ */
+ if (down_read_trylock(&sb->s_umount)) {
+ dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ up_read(&sb->s_umount);
+ }
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8a54fd8a4fa5..32c5a40c1257 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -454,7 +454,7 @@ out:
/* Sync changes in local quota file into global quota file and
* reinitialize local quota file.
* The function expects local quota file to be already locked and
- * dqonoff_mutex locked. */
+ * s_umount locked in shared mode. */
static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int type,
struct ocfs2_quota_recovery *rec)
@@ -597,7 +597,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
"slot %u\n", osb->dev_str, slot_num);
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ down_read(&sb->s_umount);
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
@@ -674,7 +674,7 @@ out_put:
break;
}
out:
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ up_read(&sb->s_umount);
kfree(rec);
return status;
}
@@ -840,7 +840,10 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
}
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
- /* dqonoff_mutex protects us against racing with recovery thread... */
+ /*
+ * s_umount held in exclusive mode protects us against racing with
+ * recovery thread...
+ */
if (oinfo->dqi_rec) {
ocfs2_free_quota_recovery(oinfo->dqi_rec);
mark_clean = 0;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19238512a324..f8933cb53d68 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
+#include "file.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -410,7 +411,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
goto out;
}
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
di = (struct ocfs2_dinode *)di_bh->b_data;
*ref_blkno = le64_to_cpu(di->i_refcount_loc);
@@ -478,7 +479,6 @@ again:
if (ret) {
mlog_errno(ret);
ocfs2_unlock_refcount_tree(osb, tree, rw);
- ocfs2_refcount_tree_put(tree);
goto out;
}
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
u32 num_got;
u64 suballoc_loc, first_blkno;
- BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+ BUG_ON(ocfs2_is_refcount_inode(inode));
trace_ocfs2_create_refcount_tree(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -708,7 +708,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode,
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_tree *ref_tree;
- BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+ BUG_ON(ocfs2_is_refcount_inode(inode));
ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
&ref_tree, &ref_root_bh);
@@ -775,7 +775,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
u16 bit = 0;
- if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ if (!ocfs2_is_refcount_inode(inode))
return 0;
BUG_ON(!ref_blkno);
@@ -2299,11 +2299,10 @@ int ocfs2_decrease_refcount(struct inode *inode,
{
int ret;
u64 ref_blkno;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_block(inode, &ref_blkno);
if (ret) {
@@ -2533,7 +2532,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
int *ref_blocks)
{
int ret;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
@@ -2544,7 +2542,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
goto out;
}
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
refcount_loc, &tree);
@@ -3412,14 +3410,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
{
int ret;
u32 cow_start = 0, cow_len = 0;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_cow_context *context = NULL;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
cpos, write_len, max_cpos,
@@ -3629,11 +3626,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
{
int ret;
struct ocfs2_xattr_value_root *xv = vb->vb_xv;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_cow_context *context = NULL;
u32 cow_start, cow_len;
- BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
cpos, write_len, UINT_MAX,
@@ -3696,6 +3692,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_alloc_context *meta_ac = NULL;
+ /* We need to be able to handle at least an extent tree split. */
+ ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
+
ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
ref_ci, ref_root_bh,
p_cluster, num_clusters,
@@ -3807,7 +3806,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
ocfs2_init_dealloc_ctxt(&dealloc);
- if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+ if (!ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_create_refcount_tree(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -3934,6 +3933,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
p_cluster, num_clusters,
meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, num_clusters));
if (ret)
mlog_errno(ret);
@@ -4442,3 +4448,434 @@ out:
return error;
}
+
+/* Update destination inode size, if necessary. */
+static int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
+{
+ handle_t *handle;
+ int ret;
+
+ dest->i_blocks = ocfs2_inode_sector_count(dest);
+
+ if (newlen <= i_size_read(dest))
+ return 0;
+
+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Extend i_size if needed. */
+ spin_lock(&OCFS2_I(dest)->ip_lock);
+ if (newlen > i_size_read(dest))
+ i_size_write(dest, newlen);
+ spin_unlock(&OCFS2_I(dest)->ip_lock);
+ dest->i_ctime = dest->i_mtime = current_time(dest);
+
+ ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+ return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree s_et;
+ struct ocfs2_extent_tree t_et;
+ struct ocfs2_dinode *dis;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_super *osb;
+ loff_t pstart, plen;
+ u32 p_cluster, num_clusters, slast, spos, tpos;
+ unsigned int ext_flags;
+ int ret = 0;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+ while (spos < slast) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ /* Look up the extent. */
+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_clusters = min_t(u32, num_clusters, slast - spos);
+
+ /* Punch out the dest range. */
+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (p_cluster == 0)
+ goto next_loop;
+
+ /* Lock the refcount btree... */
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(dis->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Mark s_inode's extent as refcounted. */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, spos,
+ p_cluster, num_clusters,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+ }
+
+ /* Map in the new extent. */
+ ext_flags |= OCFS2_EXT_REFCOUNTED;
+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ tpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+next_loop:
+ spos += num_clusters;
+ tpos += num_clusters;
+ }
+
+out:
+ return ret;
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb;
+ struct ocfs2_dinode *dis;
+ struct ocfs2_dinode *dit;
+ int ret;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ dit = (struct ocfs2_dinode *)t_bh->b_data;
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /*
+ * If we're reflinking the entire file and the source is inline
+ * data, just copy the contents.
+ */
+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+ i_size_read(t_inode) <= len &&
+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If both inodes belong to two different refcount groups then
+ * forget it because we don't know how (or want) to go merging
+ * refcount trees.
+ */
+ ret = -EOPNOTSUPP;
+ if (ocfs2_is_refcount_inode(s_inode) &&
+ ocfs2_is_refcount_inode(t_inode) &&
+ le64_to_cpu(dis->i_refcount_loc) !=
+ le64_to_cpu(dit->i_refcount_loc))
+ goto out;
+
+ /* Neither inode has a refcount tree. Add one to s_inode. */
+ if (!ocfs2_is_refcount_inode(s_inode) &&
+ !ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Ensure that both inodes end up with the same refcount tree. */
+ if (!ocfs2_is_refcount_inode(s_inode)) {
+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+ le64_to_cpu(dit->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ if (!ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(dis->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Turn off inline data in the dest file. */
+ if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Actually remap extents now. */
+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+ pos_out, len, &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+/* Lock an inode and grab a bh pointing to the inode. */
+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
+{
+ struct inode *inode1;
+ struct inode *inode2;
+ struct ocfs2_inode_info *oi1;
+ struct ocfs2_inode_info *oi2;
+ bool same_inode = (s_inode == t_inode);
+ int status;
+
+ /* First grab the VFS and rw locks. */
+ lock_two_nondirectories(s_inode, t_inode);
+ inode1 = s_inode;
+ inode2 = t_inode;
+ if (inode1->i_ino > inode2->i_ino)
+ swap(inode1, inode2);
+
+ status = ocfs2_rw_lock(inode1, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i1;
+ }
+ if (!same_inode) {
+ status = ocfs2_rw_lock(inode2, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i2;
+ }
+ }
+
+ /* Now go for the cluster locks */
+ oi1 = OCFS2_I(inode1);
+ oi2 = OCFS2_I(inode2);
+
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ if (*bh1)
+ *bh1 = NULL;
+ if (*bh2)
+ *bh2 = NULL;
+
+ /* We always want to lock the one with the lower lockid first. */
+ if (oi1->ip_blkno > oi2->ip_blkno)
+ mlog_errno(-ENOLCK);
+
+ /* lock id1 */
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_rw2;
+ }
+
+ /* lock id2 */
+ if (!same_inode) {
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_cl1;
+ }
+ } else
+ *bh2 = *bh1;
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+ return 0;
+
+out_cl1:
+ ocfs2_inode_unlock(inode1, 1);
+ brelse(*bh1);
+ *bh1 = NULL;
+out_rw2:
+ ocfs2_rw_unlock(inode2, 1);
+out_i2:
+ ocfs2_rw_unlock(inode1, 1);
+out_i1:
+ unlock_two_nondirectories(s_inode, t_inode);
+ return status;
+}
+
+/* Unlock both inodes and release buffers. */
+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ ocfs2_inode_unlock(s_inode, 1);
+ ocfs2_rw_unlock(s_inode, 1);
+ brelse(s_bh);
+ if (s_inode != t_inode) {
+ ocfs2_inode_unlock(t_inode, 1);
+ ocfs2_rw_unlock(t_inode, 1);
+ brelse(t_bh);
+ }
+ unlock_two_nondirectories(s_inode, t_inode);
+}
+
+/* Link a range of blocks from one file to another. */
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ bool same_inode = (inode_in == inode_out);
+ ssize_t ret;
+
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ /* Check file eligibility and prepare for block sharing. */
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+ &len, is_dedupe);
+ if (ret <= 0)
+ goto out_unlock;
+
+ /* Lock out changes to the allocation maps and remap. */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+ out_bh, pos_out, len);
+
+ /* Zap any page cache for the destination file's range. */
+ if (!ret)
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return 0;
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 6422bbcdb525..4af55bf4b35b 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe);
+
#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c9e828ec3c8e..dae9eb7c441e 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -24,7 +24,7 @@
#include <linux/slab.h>
#include <linux/reboot.h>
#include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "stackglue.h"
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 52c07346bea3..820359096c7a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
*/
static struct ocfs2_stack_plugin *active_stack;
+/*
+ * Report whether the currently selected cluster stack plugin is o2cb.
+ *
+ * Returns non-zero when the active stack's name equals
+ * OCFS2_STACK_PLUGIN_O2CB, and 0 for any other stack or when no stack has
+ * been selected yet (active_stack is still NULL).
+ *
+ * Not marked inline: the symbol is exported and is declared as a plain
+ * external function in stackglue.h.
+ *
+ * NOTE(review): active_stack is sampled here without taking any lock;
+ * confirm callers cannot race with a stack being connected/disconnected.
+ */
+int ocfs2_is_o2cb_active(void)
+{
+	struct ocfs2_stack_plugin *stack = active_stack;
+
+	/* No plugin selected (yet): certainly not o2cb, and nothing to deref */
+	if (!stack)
+		return 0;
+
+	return !strcmp(stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
+}
+EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
+
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
{
struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index f2dce10fae54..e3036e1790e8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
+int ocfs2_is_o2cb_active(void);
+
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f56fe39fab04..a24e42f95341 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
out += snprintf(buf + out, len - out, "Disabled\n");
else
out += snprintf(buf + out, len - out, "%lu seconds ago\n",
- (get_seconds() - os->os_scantime.tv_sec));
+ (unsigned long)(ktime_get_seconds() - os->os_scantime));
out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");
@@ -985,7 +985,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
- /* Cancel periodic syncing before we grab dqonoff_mutex */
oinfo = sb_dqinfo(sb, type)->dqi_priv;
cancel_delayed_work_sync(&oinfo->dqi_sync_work);
inode = igrab(sb->s_dquot.files[type]);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 6ad8eecefe21..94cfacc9bad7 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -87,7 +87,6 @@ const struct address_space_operations ocfs2_fast_symlink_aops = {
};
const struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index cb157a34a656..3c5384d9b3a5 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2577,7 +2577,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return 0;
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+ if (ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
le64_to_cpu(di->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
@@ -3608,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode,
}
/* Check whether the value is refcounted and do some preparation. */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+ if (ocfs2_is_refcount_inode(inode) &&
(!xis.not_found || !xbs.not_found)) {
ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
&xis, &xbs, &ref_tree,
diff --git a/fs/open.c b/fs/open.c
index d3ed8171e8e0..9921f70bc5ca 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -19,7 +19,7 @@
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c003a667ed1a..13215f26e321 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -16,7 +16,7 @@
#include <asm/openprom.h>
#include <asm/oplib.h>
#include <asm/prom.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static DEFINE_MUTEX(op_mutex);
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index f419dd999581..c4ab6fdf17a0 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -355,7 +355,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
__u64 tag;
} head;
int total = ret = iov_iter_count(iter);
- int n;
int downcall_size = sizeof(struct orangefs_downcall_s);
int head_size = sizeof(head);
@@ -372,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
return -EFAULT;
}
- n = copy_from_iter(&head, head_size, iter);
- if (n < head_size) {
+ if (!copy_from_iter_full(&head, head_size, iter)) {
gossip_err("%s: failed to copy head.\n", __func__);
return -EFAULT;
}
@@ -408,8 +406,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
return ret;
}
- n = copy_from_iter(&op->downcall, downcall_size, iter);
- if (n != downcall_size) {
+ if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) {
gossip_err("%s: failed to copy downcall.\n", __func__);
goto Efault;
}
@@ -463,10 +460,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
goto Enomem;
}
memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
- n = copy_from_iter(op->downcall.trailer_buf,
- op->downcall.trailer_size,
- iter);
- if (n != op->downcall.trailer_size) {
+ if (!copy_from_iter_full(op->downcall.trailer_buf,
+ op->downcall.trailer_size, iter)) {
gossip_err("%s: failed to copy trailer.\n", __func__);
vfree(op->downcall.trailer_buf);
goto Efault;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 02cc6139ec90..e6bbc8083d77 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -724,7 +724,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
int rc = -EINVAL;
- if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
+ if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
if (cmd == F_GETLK) {
rc = 0;
posix_test_lock(filp, fl);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 462d10933e48..5cd617980fbf 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -8,6 +8,7 @@
* Linux VFS inode operations.
*/
+#include <linux/bvec.h>
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 0748a26598fc..791912da97d7 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -434,6 +434,7 @@ static ssize_t orangefs_debug_write(struct file *file,
char *debug_string;
struct orangefs_kernel_op_s *new_op = NULL;
struct client_debug_mask c_mask = { NULL, 0, 0 };
+ char *s;
gossip_debug(GOSSIP_DEBUGFS_DEBUG,
"orangefs_debug_write: %pD\n",
@@ -521,8 +522,9 @@ static ssize_t orangefs_debug_write(struct file *file,
}
mutex_lock(&orangefs_debug_lock);
- memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
- sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
+ s = file_inode(file)->i_private;
+ memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+ sprintf(s, "%s\n", debug_string);
mutex_unlock(&orangefs_debug_lock);
*ppos += count;
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index 10b0b06e075e..02b1bbdbcc42 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -9,7 +9,6 @@
#include "orangefs-bufmap.h"
const struct inode_operations orangefs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 34355818a2e0..0daac5112f7a 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -8,3 +8,17 @@ config OVERLAY_FS
merged with the 'upper' object.
For more information see Documentation/filesystems/overlayfs.txt
+
+config OVERLAY_FS_REDIRECT_DIR
+ bool "Overlayfs: turn on redirect dir feature by default"
+ depends on OVERLAY_FS
+ help
+ If this config option is enabled then overlay filesystems will use
+ redirects when renaming directories by default. In this case it is
+ still possible to turn off redirects globally with the
+ "redirect_dir=off" module option or on a filesystem instance basis
+ with the "redirect_dir=off" mount option.
+
+ Note that redirects are not backward compatible. That is, mounting
+ an overlay which has redirects on a kernel that doesn't support this
+ feature will have unexpected results.
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 900daed3e91d..99373bbc1478 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_OVERLAY_FS) += overlay.o
-overlay-objs := super.o inode.o dir.o readdir.o copy_up.o
+overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 36795eed40b0..f57043dace62 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -33,7 +33,7 @@ static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
const struct dentry *dentry = data;
- if (f->f_inode == d_inode(dentry))
+ if (file_inode(f) == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
@@ -153,6 +153,13 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
goto out_fput;
}
+ /* Try to use clone_file_range to clone up within the same fs */
+ error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
+ if (!error)
+ goto out;
+ /* Couldn't clone, so now we try to copy the data */
+ error = 0;
+
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
@@ -177,7 +184,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
len -= bytes;
}
-
+out:
if (!error)
error = vfs_fsync(new_file, 0);
fput(new_file);
@@ -231,10 +238,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
- umode_t mode = stat->mode;
int err;
const struct cred *old_creds = NULL;
struct cred *new_creds = NULL;
+ struct cattr cattr = {
+ /* Can't properly set mode on creation because of the umask */
+ .mode = stat->mode & S_IFMT,
+ .rdev = stat->rdev,
+ .link = link
+ };
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
@@ -254,10 +266,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (new_creds)
old_creds = override_creds(new_creds);
- /* Can't properly set mode on creation because of the umask */
- stat->mode &= S_IFMT;
- err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
- stat->mode = mode;
+ err = ovl_create_real(wdir, newdentry, &cattr, NULL, true);
if (new_creds) {
revert_creds(old_creds);
@@ -296,12 +305,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
ovl_dentry_update(dentry, newdentry);
ovl_inode_update(d_inode(dentry), d_inode(newdentry));
newdentry = NULL;
-
- /*
- * Non-directores become opaque when copied up.
- */
- if (!S_ISDIR(stat->mode))
- ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
@@ -317,20 +320,14 @@ out_cleanup:
/*
* Copy up a single dentry
*
- * Directory renames only allowed on "pure upper" (already created on
- * upper filesystem, never copied up). Directories which are on lower or
- * are merged may not be renamed. For these -EXDEV is returned and
- * userspace has to deal with it. This means, when copying up a
- * directory we can rely on it and ancestors being stable.
- *
- * Non-directory renames start with copy up of source if necessary. The
- * actual rename will only proceed once the copy up was successful. Copy
- * up uses upper parent i_mutex for exclusion. Since rename can change
- * d_parent it is possible that the copy up will lock the old parent. At
- * that point the file will have already been copied up anyway.
+ * All renames start with copy up of source if necessary. The actual
+ * rename will only proceed once the copy up was successful. Copy up uses
+ * upper parent i_mutex for exclusion. Since rename can change d_parent it
+ * is possible that the copy up will lock the old parent. At that point
+ * the file will have already been copied up anyway.
*/
-int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat)
+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat)
{
DEFINE_DELAYED_CALL(done);
struct dentry *workdir = ovl_workdir(dentry);
@@ -339,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path parentpath;
struct dentry *lowerdentry = lowerpath->dentry;
struct dentry *upperdir;
- struct dentry *upperdentry;
const char *link = NULL;
if (WARN_ON(!workdir))
@@ -365,8 +361,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry) {
+ if (ovl_dentry_upper(dentry)) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
@@ -385,7 +380,7 @@ out_unlock:
return err;
}
-int ovl_copy_up(struct dentry *dentry)
+int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
@@ -415,6 +410,9 @@ int ovl_copy_up(struct dentry *dentry)
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
+ /* maybe truncate regular file. this has no effect on dirs */
+ if (flags & O_TRUNC)
+ stat.size = 0;
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
@@ -425,3 +423,8 @@ int ovl_copy_up(struct dentry *dentry)
return err;
}
+
+/*
+ * Copy up @dentry with no open flags, i.e. ovl_copy_up_flags() with
+ * flags == 0, so the O_TRUNC size shortcut is never applied.
+ */
+int ovl_copy_up(struct dentry *dentry)
+{
+	int err;
+
+	err = ovl_copy_up_flags(dentry, 0);
+
+	return err;
+}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 306b6c161840..16e06dd89457 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -12,11 +12,18 @@
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
+#include <linux/module.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/atomic.h>
+#include <linux/ratelimit.h>
#include "overlayfs.h"
+/*
+ * Size limit used when building an absolute redirect path (see the
+ * description string below).  Exposed as module parameter "redirect_max"
+ * with mode 0644; the description must be keyed on that same external
+ * name, not on the C variable name, to be attached to the parameter.
+ */
+static unsigned short ovl_redirect_max = 256;
+module_param_named(redirect_max, ovl_redirect_max, ushort, 0644);
+MODULE_PARM_DESC(redirect_max,
+		 "Maximum length of absolute redirect xattr value");
+
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
@@ -75,8 +82,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir,
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct kstat *stat, const char *link,
- struct dentry *hardlink, bool debug)
+ struct cattr *attr, struct dentry *hardlink, bool debug)
{
int err;
@@ -86,13 +92,13 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
- switch (stat->mode & S_IFMT) {
+ switch (attr->mode & S_IFMT) {
case S_IFREG:
- err = ovl_do_create(dir, newdentry, stat->mode, debug);
+ err = ovl_do_create(dir, newdentry, attr->mode, debug);
break;
case S_IFDIR:
- err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+ err = ovl_do_mkdir(dir, newdentry, attr->mode, debug);
break;
case S_IFCHR:
@@ -100,11 +106,11 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
- stat->mode, stat->rdev, debug);
+ attr->mode, attr->rdev, debug);
break;
case S_IFLNK:
- err = ovl_do_symlink(dir, newdentry, link, debug);
+ err = ovl_do_symlink(dir, newdentry, attr->link, debug);
break;
default:
@@ -121,20 +127,15 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
return err;
}
-static int ovl_set_opaque(struct dentry *upperdentry)
-{
- return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
-}
-
-static void ovl_remove_opaque(struct dentry *upperdentry)
+static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
{
int err;
- err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
- if (err) {
- pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
- upperdentry->d_name.name, err);
- }
+ err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
+ if (!err)
+ ovl_dentry_set_opaque(dentry);
+
+ return err;
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -182,9 +183,13 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
}
+/* True when the overlay path type of @dentry has the MERGE property. */
+static bool ovl_type_merge(struct dentry *dentry)
+{
+	enum ovl_path_type type = ovl_path_type(dentry);
+
+	return OVL_TYPE_MERGE(type);
+}
+
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
- struct kstat *stat, const char *link,
- struct dentry *hardlink)
+ struct cattr *attr, struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
@@ -192,7 +197,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
int err;
if (!hardlink && !IS_POSIXACL(udir))
- stat->mode &= ~current_umask();
+ attr->mode &= ~current_umask();
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
@@ -200,10 +205,15 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
- err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+ err = ovl_create_real(udir, newdentry, attr, hardlink, false);
if (err)
goto out_dput;
+ if (ovl_type_merge(dentry->d_parent)) {
+ /* Setting opaque here is just an optimization, allow to fail */
+ ovl_set_opaque(dentry, newdentry);
+ }
+
ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput:
@@ -270,7 +280,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (IS_ERR(opaquedir))
goto out_unlock;
- err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+ err = ovl_create_real(wdir, opaquedir,
+ &(struct cattr){.mode = stat.mode}, NULL, true);
if (err)
goto out_dput;
@@ -278,7 +289,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- err = ovl_set_opaque(opaquedir);
+ err = ovl_set_opaque(dentry, opaquedir);
if (err)
goto out_cleanup;
@@ -370,7 +381,7 @@ out_free:
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
- struct kstat *stat, const char *link,
+ struct cattr *cattr,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
@@ -387,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (!hardlink) {
err = posix_acl_create(dentry->d_parent->d_inode,
- &stat->mode, &default_acl, &acl);
+ &cattr->mode, &default_acl, &acl);
if (err)
return err;
}
@@ -407,7 +418,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (IS_ERR(upper))
goto out_dput;
- err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+ err = ovl_create_real(wdir, newdentry, cattr, hardlink, true);
if (err)
goto out_dput2;
@@ -415,10 +426,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
* mode could have been mutilated due to umask (e.g. sgid directory)
*/
if (!hardlink &&
- !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) {
+ !S_ISLNK(cattr->mode) &&
+ newdentry->d_inode->i_mode != cattr->mode) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
- .ia_mode = stat->mode,
+ .ia_mode = cattr->mode,
};
inode_lock(newdentry->d_inode);
err = notify_change(newdentry, &attr, NULL);
@@ -438,8 +450,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_cleanup;
}
- if (!hardlink && S_ISDIR(stat->mode)) {
- err = ovl_set_opaque(newdentry);
+ if (!hardlink && S_ISDIR(cattr->mode)) {
+ err = ovl_set_opaque(dentry, newdentry);
if (err)
goto out_cleanup;
@@ -475,8 +487,7 @@ out_cleanup:
}
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
- struct kstat *stat, const char *link,
- struct dentry *hardlink)
+ struct cattr *attr, struct dentry *hardlink)
{
int err;
const struct cred *old_cred;
@@ -494,7 +505,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
override_cred->fsgid = inode->i_gid;
if (!hardlink) {
err = security_dentry_create_files_as(dentry,
- stat->mode, &dentry->d_name, old_cred,
+ attr->mode, &dentry->d_name, old_cred,
override_cred);
if (err) {
put_cred(override_cred);
@@ -504,12 +515,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
put_cred(override_creds(override_cred));
put_cred(override_cred);
- if (!ovl_dentry_is_opaque(dentry))
- err = ovl_create_upper(dentry, inode, stat, link,
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr,
hardlink);
else
- err = ovl_create_over_whiteout(dentry, inode, stat,
- link, hardlink);
+ err = ovl_create_over_whiteout(dentry, inode, attr,
+ hardlink);
}
out_revert_creds:
revert_creds(old_cred);
@@ -528,8 +539,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
{
int err;
struct inode *inode;
- struct kstat stat = {
+ struct cattr attr = {
.rdev = rdev,
+ .link = link,
};
err = ovl_want_write(dentry);
@@ -537,14 +549,14 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
goto out;
err = -ENOMEM;
- inode = ovl_new_inode(dentry->d_sb, mode);
+ inode = ovl_new_inode(dentry->d_sb, mode, rdev);
if (!inode)
goto out_drop_write;
inode_init_owner(inode, dentry->d_parent->d_inode, mode);
- stat.mode = inode->i_mode;
+ attr.mode = inode->i_mode;
- err = ovl_create_or_link(dentry, inode, &stat, link, NULL);
+ err = ovl_create_or_link(dentry, inode, &attr, NULL);
if (err)
iput(inode);
@@ -598,7 +610,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
inode = d_inode(old);
ihold(inode);
- err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old));
+ err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old));
if (err)
iput(inode);
@@ -684,8 +696,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
+ struct dentry *opaquedir = NULL;
int err;
+ /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */
+ if (is_dir && ovl_dentry_get_redirect(dentry)) {
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
inode_lock_nested(dir, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
@@ -694,14 +715,15 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
goto out_unlock;
err = -ESTALE;
- if (upper == ovl_dentry_upper(dentry)) {
- if (is_dir)
- err = vfs_rmdir(dir, upper);
- else
- err = vfs_unlink(dir, upper, NULL);
- ovl_dentry_version_inc(dentry->d_parent);
- }
- dput(upper);
+ if ((opaquedir && upper != opaquedir) ||
+ (!opaquedir && upper != ovl_dentry_upper(dentry)))
+ goto out_dput_upper;
+
+ if (is_dir)
+ err = vfs_rmdir(dir, upper);
+ else
+ err = vfs_unlink(dir, upper, NULL);
+ ovl_dentry_version_inc(dentry->d_parent);
/*
* Keeping this dentry hashed would mean having to release
@@ -711,34 +733,21 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
*/
if (!err)
d_drop(dentry);
+out_dput_upper:
+ dput(upper);
out_unlock:
inode_unlock(dir);
-
+ dput(opaquedir);
+out:
return err;
}
-static inline int ovl_check_sticky(struct dentry *dentry)
-{
- struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
- struct inode *inode = ovl_dentry_real(dentry)->d_inode;
-
- if (check_sticky(dir, inode))
- return -EPERM;
-
- return 0;
-}
-
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
const struct cred *old_cred;
-
- err = ovl_check_sticky(dentry);
- if (err)
- goto out;
-
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -750,7 +759,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
type = ovl_path_type(dentry);
old_cred = ovl_override_creds(dentry->d_sb);
- if (OVL_TYPE_PURE_UPPER(type))
+ if (!ovl_lower_positive(dentry))
err = ovl_remove_upper(dentry, is_dir);
else
err = ovl_remove_and_whiteout(dentry, is_dir);
@@ -777,13 +786,114 @@ static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
return ovl_do_remove(dentry, true);
}
-static int ovl_rename2(struct inode *olddir, struct dentry *old,
- struct inode *newdir, struct dentry *new,
- unsigned int flags)
+/*
+ * True when @dentry's overlay path type is either MERGE or lacks the
+ * UPPER property.
+ */
+static bool ovl_type_merge_or_lower(struct dentry *dentry)
+{
+	enum ovl_path_type path_type = ovl_path_type(dentry);
+
+	if (OVL_TYPE_MERGE(path_type))
+		return true;
+
+	return !OVL_TYPE_UPPER(path_type);
+}
+
+/*
+ * Decide whether @dentry may be renamed by overlayfs itself.
+ *
+ * Allowed when ovl_redirect_dir() is true for the superblock, when the
+ * dentry is not a directory, or when it is a directory that is neither
+ * merged nor lower-only.  The rename path returns -EXDEV otherwise.
+ */
+static bool ovl_can_move(struct dentry *dentry)
+{
+	if (ovl_redirect_dir(dentry->d_sb))
+		return true;
+
+	if (!d_is_dir(dentry))
+		return true;
+
+	return !ovl_type_merge_or_lower(dentry);
+}
+
+/*
+ * Build the redirect string to be stored for @dentry.
+ *
+ * With @samedir the redirect is simply a copy of the dentry's own name.
+ * Otherwise a path is assembled back-to-front in a scratch buffer of
+ * ovl_redirect_max bytes by walking ->d_parent towards the root: each
+ * ancestor contributes its existing redirect if it has one, else its name,
+ * and the walk stops early once a component starting with '/' (an absolute
+ * redirect) has been copied in.
+ *
+ * Returns a newly allocated string which the caller must kfree(),
+ * ERR_PTR(-EXDEV) when the path does not fit in the scratch buffer, or
+ * ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static char *ovl_get_redirect(struct dentry *dentry, bool samedir)
+{
+	char *buf, *ret;
+	struct dentry *d, *tmp;
+	int buflen = ovl_redirect_max + 1;
+
+	if (samedir) {
+		ret = kstrndup(dentry->d_name.name, dentry->d_name.len,
+			       GFP_KERNEL);
+		goto out;
+	}
+
+	buf = ret = kmalloc(buflen, GFP_TEMPORARY);
+	if (!buf)
+		goto out;
+
+	/* Fill from the end: terminator first, then components right-to-left */
+	buflen--;
+	buf[buflen] = '\0';
+	for (d = dget(dentry); !IS_ROOT(d);) {
+		const char *name;
+		int thislen;
+
+		/* d_lock keeps the name/redirect stable while it is copied */
+		spin_lock(&d->d_lock);
+		name = ovl_dentry_get_redirect(d);
+		if (name) {
+			thislen = strlen(name);
+		} else {
+			name = d->d_name.name;
+			thislen = d->d_name.len;
+		}
+
+		/* If path is too long, fall back to userspace move */
+		if (thislen + (name[0] != '/') > buflen) {
+			ret = ERR_PTR(-EXDEV);
+			spin_unlock(&d->d_lock);
+			goto out_put;
+		}
+
+		buflen -= thislen;
+		memcpy(&buf[buflen], name, thislen);
+		spin_unlock(&d->d_lock);
+
+		/*
+		 * Take the parent reference with dget_parent() rather than
+		 * dget_dlock(d->d_parent): dget_dlock() requires the d_lock of
+		 * the dentry whose count is bumped, and only the child's lock
+		 * was held above.
+		 */
+		tmp = dget_parent(d);
+
+		dput(d);
+		d = tmp;
+
+		/* Absolute redirect: finished */
+		if (buf[buflen] == '/')
+			break;
+		buflen--;
+		buf[buflen] = '/';
+	}
+	ret = kstrdup(&buf[buflen], GFP_KERNEL);
+out_put:
+	dput(d);
+	kfree(buf);
+out:
+	return ret ? ret : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Make sure @dentry carries a redirect suitable for the pending rename.
+ *
+ * Nothing is done if a redirect is already recorded and either the rename
+ * stays in the same directory or the recorded redirect is absolute.
+ * Otherwise a new redirect string is built, written to the upper dentry as
+ * the OVL_XATTR_REDIRECT xattr and recorded on the overlay dentry.
+ *
+ * Returns 0 on success, the error from ovl_get_redirect(), or -EXDEV when
+ * the xattr could not be written (so the caller falls back to a userspace
+ * copy).  An -EOPNOTSUPP from the xattr write additionally clears the
+ * redirect_dir setting on the superblock.
+ */
+static int ovl_set_redirect(struct dentry *dentry, bool samedir)
+{
+	const char *cur = ovl_dentry_get_redirect(dentry);
+	const char *redirect;
+	int err;
+
+	if (cur && (samedir || cur[0] == '/'))
+		return 0;
+
+	redirect = ovl_get_redirect(dentry, samedir);
+	if (IS_ERR(redirect))
+		return PTR_ERR(redirect);
+
+	err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT,
+			      redirect, strlen(redirect), 0);
+	if (!err) {
+		/* The string is handed to the dentry here, so it is not freed */
+		spin_lock(&dentry->d_lock);
+		ovl_dentry_set_redirect(dentry, redirect);
+		spin_unlock(&dentry->d_lock);
+		return 0;
+	}
+
+	kfree(redirect);
+	if (err == -EOPNOTSUPP)
+		ovl_clear_redirect_dir(dentry->d_sb);
+	else
+		pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err);
+
+	/* Fall back to userspace copy-up */
+	return -EXDEV;
+}
+
+static int ovl_rename(struct inode *olddir, struct dentry *old,
+ struct inode *newdir, struct dentry *new,
+ unsigned int flags)
{
int err;
- enum ovl_path_type old_type;
- enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
@@ -794,7 +904,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
- bool new_is_dir = false;
+ bool new_is_dir = d_is_dir(new);
+ bool samedir = olddir == newdir;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
@@ -804,46 +915,12 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
flags &= ~RENAME_NOREPLACE;
- err = ovl_check_sticky(old);
- if (err)
- goto out;
-
/* Don't copy up directory trees */
- old_type = ovl_path_type(old);
err = -EXDEV;
- if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
+ if (!ovl_can_move(old))
+ goto out;
+ if (!overwrite && !ovl_can_move(new))
goto out;
-
- if (new->d_inode) {
- err = ovl_check_sticky(new);
- if (err)
- goto out;
-
- if (d_is_dir(new))
- new_is_dir = true;
-
- new_type = ovl_path_type(new);
- err = -EXDEV;
- if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
- goto out;
-
- err = 0;
- if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
- if (ovl_dentry_lower(old)->d_inode ==
- ovl_dentry_lower(new)->d_inode)
- goto out;
- }
- if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
- if (ovl_dentry_upper(old)->d_inode ==
- ovl_dentry_upper(new)->d_inode)
- goto out;
- }
- } else {
- if (ovl_dentry_is_opaque(new))
- new_type = __OVL_PATH_UPPER;
- else
- new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
- }
err = ovl_want_write(old);
if (err)
@@ -862,12 +939,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
goto out_drop_write;
}
- old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
- new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
-
old_cred = ovl_override_creds(old->d_sb);
- if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
+ if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
@@ -877,15 +951,15 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
}
if (overwrite) {
- if (old_opaque) {
- if (new->d_inode || !new_opaque) {
+ if (ovl_lower_positive(old)) {
+ if (!ovl_dentry_is_whiteout(new)) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
- } else if (is_dir && !new->d_inode && new_opaque) {
+ } else if (is_dir && ovl_dentry_is_whiteout(new)) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
@@ -896,7 +970,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
trap = lock_rename(new_upperdir, old_upperdir);
-
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
@@ -913,6 +986,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (IS_ERR(newdentry))
goto out_dput_old;
+ old_opaque = ovl_dentry_is_opaque(old);
+ new_opaque = ovl_dentry_is_opaque(new);
+
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
@@ -933,54 +1009,31 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (newdentry == trap)
goto out_dput;
- if (is_dir && !old_opaque && new_opaque) {
- err = ovl_set_opaque(olddentry);
+ if (WARN_ON(olddentry->d_inode == newdentry->d_inode))
+ goto out_dput;
+
+ err = 0;
+ if (is_dir) {
+ if (ovl_type_merge_or_lower(old))
+ err = ovl_set_redirect(old, samedir);
+ else if (!old_opaque && ovl_type_merge(new->d_parent))
+ err = ovl_set_opaque(old, olddentry);
if (err)
goto out_dput;
}
- if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
- err = ovl_set_opaque(newdentry);
+ if (!overwrite && new_is_dir) {
+ if (ovl_type_merge_or_lower(new))
+ err = ovl_set_redirect(new, samedir);
+ else if (!new_opaque && ovl_type_merge(old->d_parent))
+ err = ovl_set_opaque(new, newdentry);
if (err)
goto out_dput;
}
- if (old_opaque || new_opaque) {
- err = ovl_do_rename(old_upperdir->d_inode, olddentry,
- new_upperdir->d_inode, newdentry,
- flags);
- } else {
- /* No debug for the plain case */
- BUG_ON(flags & ~RENAME_EXCHANGE);
- err = vfs_rename(old_upperdir->d_inode, olddentry,
- new_upperdir->d_inode, newdentry,
- NULL, flags);
- }
-
- if (err) {
- if (is_dir && !old_opaque && new_opaque)
- ovl_remove_opaque(olddentry);
- if (!overwrite && new_is_dir && old_opaque && !new_opaque)
- ovl_remove_opaque(newdentry);
+ err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry, flags);
+ if (err)
goto out_dput;
- }
-
- if (is_dir && old_opaque && !new_opaque)
- ovl_remove_opaque(olddentry);
- if (!overwrite && new_is_dir && !old_opaque && new_opaque)
- ovl_remove_opaque(newdentry);
-
- /*
- * Old dentry now lives in different location. Dentries in
- * lowerstack are stale. We cannot drop them here because
- * access to them is lockless. This could be only pure upper
- * or opaque directory - numlower is zero. Or upper non-dir
- * entry - its pureness is tracked by flag opaque.
- */
- if (old_opaque != new_opaque) {
- ovl_dentry_set_opaque(old, new_opaque);
- if (!overwrite)
- ovl_dentry_set_opaque(new, old_opaque);
- }
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
@@ -1009,7 +1062,7 @@ const struct inode_operations ovl_dir_inode_operations = {
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
- .rename = ovl_rename2,
+ .rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7fb53d055537..08643ac44a02 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -13,34 +13,6 @@
#include <linux/posix_acl.h>
#include "overlayfs.h"
-static int ovl_copy_up_truncate(struct dentry *dentry)
-{
- int err;
- struct dentry *parent;
- struct kstat stat;
- struct path lowerpath;
- const struct cred *old_cred;
-
- parent = dget_parent(dentry);
- err = ovl_copy_up(parent);
- if (err)
- goto out_dput_parent;
-
- ovl_path_lower(dentry, &lowerpath);
-
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&lowerpath, &stat);
- if (!err) {
- stat.size = 0;
- err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
- }
- revert_creds(old_cred);
-
-out_dput_parent:
- dput(parent);
- return err;
-}
-
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
@@ -64,27 +36,10 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
goto out;
- if (attr->ia_valid & ATTR_SIZE) {
- struct inode *realinode = d_inode(ovl_dentry_real(dentry));
-
- err = -ETXTBSY;
- if (atomic_read(&realinode->i_writecount) < 0)
- goto out_drop_write;
- }
-
err = ovl_copy_up(dentry);
if (!err) {
- struct inode *winode = NULL;
-
upperdentry = ovl_dentry_upper(dentry);
- if (attr->ia_valid & ATTR_SIZE) {
- winode = d_inode(upperdentry);
- err = get_write_access(winode);
- if (err)
- goto out_drop_write;
- }
-
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
@@ -95,11 +50,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
-
- if (winode)
- put_write_access(winode);
}
-out_drop_write:
ovl_drop_write(dentry);
out:
return err;
@@ -302,10 +253,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
err = ovl_want_write(dentry);
if (!err) {
- if (file_flags & O_TRUNC)
- err = ovl_copy_up_truncate(dentry);
- else
- err = ovl_copy_up(dentry);
+ err = ovl_copy_up_flags(dentry, file_flags);
ovl_drop_write(dentry);
}
}
@@ -348,13 +296,12 @@ static const struct inode_operations ovl_file_inode_operations = {
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
- .readlink = generic_readlink,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.update_time = ovl_update_time,
};
-static void ovl_fill_inode(struct inode *inode, umode_t mode)
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_ino = get_next_ino();
inode->i_mode = mode;
@@ -363,8 +310,11 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif
- mode &= S_IFMT;
- switch (mode) {
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &ovl_file_inode_operations;
+ break;
+
case S_IFDIR:
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
@@ -375,26 +325,19 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
break;
default:
- WARN(1, "illegal file type: %i\n", mode);
- /* Fall through */
-
- case S_IFREG:
- case S_IFSOCK:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
+ init_special_inode(inode, mode, rdev);
break;
}
}
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
struct inode *inode;
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode);
+ ovl_fill_inode(inode, mode, rdev);
return inode;
}
@@ -418,7 +361,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
inode = iget5_locked(sb, (unsigned long) realinode,
ovl_inode_test, ovl_inode_set, realinode);
if (inode && inode->i_state & I_NEW) {
- ovl_fill_inode(inode, realinode->i_mode);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
set_nlink(inode, realinode->i_nlink);
unlock_new_inode(inode);
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
new file mode 100644
index 000000000000..023bb0b03352
--- /dev/null
+++ b/fs/overlayfs/namei.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/ratelimit.h>
+#include "overlayfs.h"
+#include "ovl_entry.h"
+
+/* State carried across the per-layer lookups done for one ovl_lookup() call */
+struct ovl_lookup_data {
+ struct qstr name; /* name to look up; repointed at @redirect by ovl_check_redirect() */
+ bool is_dir; /* set once a lookup-capable dentry was found; later non-dirs are dropped */
+ bool opaque; /* set together with @stop on a whiteout or an opaque directory */
+ bool stop; /* when set, ovl_lookup() does not descend into further layers */
+ bool last; /* current layer is the last one; the opaque xattr check is skipped */
+ char *redirect; /* heap buffer backing @name after a redirect; freed by ovl_lookup() */
+};
+
+/*
+ * Read the OVL_XATTR_REDIRECT xattr of @dentry and, if it is present and well
+ * formed, replace d->name (and d->redirect) with the redirected path.
+ *
+ * A value starting with '/' is used as is, after checking that every '/' is
+ * followed by a non-empty component. Any other value must not contain '/';
+ * the first @prelen bytes of the current name are put in front of it. @post
+ * is appended in both cases.
+ *
+ * Returns -ENOMEM if the buffer cannot be allocated and 0 otherwise. A missing
+ * xattr (-ENODATA/-EOPNOTSUPP) is silently ignored; any other read failure or
+ * an invalid value logs a rate-limited warning and leaves d->name untouched.
+ */
+static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
+ size_t prelen, const char *post)
+{
+ int res;
+ char *s, *next, *buf = NULL;
+
+ /* First call passes no buffer; its result is used as the value length */
+ res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return 0;
+ goto fail;
+ }
+ /* Room for the prefix, the xattr value, @post and the terminating NUL */
+ buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (res == 0)
+ goto invalid;
+
+ res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+ if (res < 0)
+ goto fail;
+ if (res == 0)
+ goto invalid;
+ if (buf[0] == '/') {
+ /* Absolute redirect: reject empty components ("//" or a trailing '/') */
+ for (s = buf; *s++ == '/'; s = next) {
+ next = strchrnul(s, '/');
+ if (s == next)
+ goto invalid;
+ }
+ } else {
+ /* Relative redirect: must be a single component */
+ if (strchr(buf, '/') != NULL)
+ goto invalid;
+
+ /* Shift the value right and copy the first prelen bytes of the name in front */
+ memmove(buf + prelen, buf, res);
+ memcpy(buf, d->name.name, prelen);
+ }
+
+ /*
+ * @post may point into the old d->redirect (ovl_lookup_layer() passes a
+ * pointer into d->name.name), so append it before that buffer is freed.
+ */
+ strcat(buf, post);
+ kfree(d->redirect);
+ d->redirect = buf;
+ d->name.name = d->redirect;
+ d->name.len = strlen(d->redirect);
+
+ return 0;
+
+err_free:
+ kfree(buf);
+ return 0;
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
+ goto err_free;
+invalid:
+ pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
+ goto err_free;
+}
+
+/*
+ * True iff @dentry is a directory whose OVL_XATTR_OPAQUE xattr is exactly the
+ * single byte 'y'.
+ */
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+	char flag;
+
+	if (!d_is_dir(dentry))
+		return false;
+
+	/* @flag is only inspected when exactly one byte was read */
+	return vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &flag, 1) == 1 &&
+	       flag == 'y';
+}
+
+/*
+ * Look up one path component @name (@namelen bytes) under @base.
+ *
+ * On success 0 is returned and *@ret is either the positive dentry that was
+ * found or NULL when nothing usable is there: -ENOENT/-ENAMETOOLONG from the
+ * lookup, a negative dentry, a whiteout, or a non-directory when an earlier
+ * lookup already found a directory. d->stop, d->opaque and d->is_dir are
+ * updated as a side effect. @prelen and @post are passed through to
+ * ovl_check_redirect().
+ *
+ * Any other failure is returned as a negative errno and *@ret is not written.
+ */
+static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ const char *name, unsigned int namelen,
+ size_t prelen, const char *post,
+ struct dentry **ret)
+{
+ struct dentry *this;
+ int err;
+
+ this = lookup_one_len_unlocked(name, base, namelen);
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ this = NULL;
+ /* These two errors are reported as "not found" rather than as a failure */
+ if (err == -ENOENT || err == -ENAMETOOLONG)
+ goto out;
+ goto out_err;
+ }
+ if (!this->d_inode)
+ goto put_and_out;
+
+ if (ovl_dentry_weird(this)) {
+ /* Don't support traversing automounts and other weirdness */
+ err = -EREMOTE;
+ goto out_err;
+ }
+ if (ovl_is_whiteout(this)) {
+ /* A whiteout ends the search and yields no dentry for this layer */
+ d->stop = d->opaque = true;
+ goto put_and_out;
+ }
+ if (!d_can_lookup(this)) {
+ /* Not a directory: stop here; drop it if a directory was already found */
+ d->stop = true;
+ if (d->is_dir)
+ goto put_and_out;
+ goto out;
+ }
+ d->is_dir = true;
+ if (!d->last && ovl_is_opaquedir(this)) {
+ d->stop = d->opaque = true;
+ goto out;
+ }
+ err = ovl_check_redirect(this, d, prelen, post);
+ if (err)
+ goto out_err;
+out:
+ *ret = this;
+ return 0;
+
+put_and_out:
+ dput(this);
+ this = NULL;
+ goto out;
+
+out_err:
+ dput(this);
+ return err;
+}
+
+/*
+ * Look up d->name in one layer, starting from @base.
+ *
+ * A name that does not start with '/' is handed straight to
+ * ovl_lookup_single() as a single component. Otherwise the name is walked one
+ * '/'-separated component at a time. The position is tracked as the number of
+ * bytes remaining (rem), because ovl_check_redirect() may replace d->name
+ * with a string that has a different prefix.
+ *
+ * Returns 0 with the result (possibly NULL) in *@ret, or a negative errno.
+ */
+static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
+ struct dentry **ret)
+{
+ /* Counting down from the end, since the prefix can change */
+ size_t rem = d->name.len - 1;
+ struct dentry *dentry = NULL;
+ int err;
+
+ if (d->name.name[0] != '/')
+ return ovl_lookup_single(base, d, d->name.name, d->name.len,
+ 0, "", ret);
+
+ while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
+ /* Start of the current component, located from the end of the name */
+ const char *s = d->name.name + d->name.len - rem;
+ const char *next = strchrnul(s, '/');
+ size_t thislen = next - s;
+ bool end = !next[0];
+
+ /* Verify we did not go off the rails */
+ if (WARN_ON(s[-1] != '/'))
+ return -EIO;
+
+ /* On success @base is replaced by the dentry of this component */
+ err = ovl_lookup_single(base, d, s, thislen,
+ d->name.len - rem, next, &base);
+ /* The dentry of the previous component is no longer needed */
+ dput(dentry);
+ if (err)
+ return err;
+ dentry = base;
+ if (end)
+ break;
+
+ /* Skip this component and the '/' that follows it */
+ rem -= thislen + 1;
+
+ /* rem is unsigned: an underflow above shows up as a huge value */
+ if (WARN_ON(rem >= d->name.len))
+ return -EIO;
+ }
+ *ret = dentry;
+ return 0;
+}
+
+/*
+ * Iterate over the layers of @dentry from the top down.
+ *
+ * @idx 0 asks for the upper layer; if there is no upper dentry the first
+ * lower layer is returned instead. @path is filled with the selected layer.
+ * Returns the index to pass on the next call, or -1 if this was the last
+ * layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+	struct ovl_entry *entry = dentry->d_fsdata;
+
+	BUG_ON(idx < 0);
+	if (!idx) {
+		ovl_path_upper(dentry, path);
+		if (path->dentry) {
+			if (entry->numlower)
+				return 1;
+			return -1;
+		}
+		/* No upper dentry: fall through to the first lower layer */
+		idx = 1;
+	}
+	BUG_ON(idx > entry->numlower);
+	*path = entry->lowerstack[idx - 1];
+
+	if (idx < entry->numlower)
+		return idx + 1;
+	return -1;
+}
+
+/*
+ * Look up @dentry in the overlay.
+ *
+ * The name is first looked up in the parent's upper directory (if any) and
+ * then in each of the parent's lower layers, top down, until a layer sets
+ * d.stop. Lower dentries that were found are collected in a temporary stack
+ * and copied into the new ovl_entry. An absolute redirect switches the parent
+ * entry used for the remaining layers to the root dentry's entry.
+ *
+ * On success the dentry is instantiated with d_add() (with a NULL inode if
+ * nothing was found) and NULL is returned. On failure an ERR_PTR is returned:
+ * -ENAMETOOLONG if the name is longer than ofs->namelen, -EREMOTE if the upper
+ * dentry is remote, -ENOMEM on allocation failure, or the error of a layer
+ * lookup.
+ */
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+			  unsigned int flags)
+{
+	struct ovl_entry *oe;
+	const struct cred *old_cred;
+	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+	struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+	struct path *stack = NULL;
+	struct dentry *upperdir, *upperdentry = NULL;
+	unsigned int ctr = 0;
+	struct inode *inode = NULL;
+	bool upperopaque = false;
+	char *upperredirect = NULL;
+	struct dentry *this;
+	unsigned int i;
+	int err;
+	struct ovl_lookup_data d = {
+		.name = dentry->d_name,
+		.is_dir = false,
+		.opaque = false,
+		.stop = false,
+		.last = !poe->numlower,
+		.redirect = NULL,
+	};
+
+	if (dentry->d_name.len > ofs->namelen)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	upperdir = ovl_upperdentry_dereference(poe);
+	if (upperdir) {
+		err = ovl_lookup_layer(upperdir, &d, &upperdentry);
+		if (err)
+			goto out;
+
+		if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) {
+			dput(upperdentry);
+			err = -EREMOTE;
+			goto out;
+		}
+
+		if (d.redirect) {
+			/*
+			 * err is 0 here from the successful upper lookup; set
+			 * it before the allocation so that a kstrdup() failure
+			 * is not returned as ERR_PTR(0), i.e. as success.
+			 */
+			err = -ENOMEM;
+			upperredirect = kstrdup(d.redirect, GFP_KERNEL);
+			if (!upperredirect)
+				goto out_put_upper;
+			/* Absolute redirect: continue from the overlay root */
+			if (d.redirect[0] == '/')
+				poe = dentry->d_sb->s_root->d_fsdata;
+		}
+		upperopaque = d.opaque;
+	}
+
+	if (!d.stop && poe->numlower) {
+		err = -ENOMEM;
+		stack = kcalloc(ofs->numlower, sizeof(struct path),
+				GFP_TEMPORARY);
+		if (!stack)
+			goto out_put_upper;
+	}
+
+	for (i = 0; !d.stop && i < poe->numlower; i++) {
+		struct path lowerpath = poe->lowerstack[i];
+
+		d.last = i == poe->numlower - 1;
+		err = ovl_lookup_layer(lowerpath.dentry, &d, &this);
+		if (err)
+			goto out_put;
+
+		if (!this)
+			continue;
+
+		stack[ctr].dentry = this;
+		stack[ctr].mnt = lowerpath.mnt;
+		ctr++;
+
+		if (d.stop)
+			break;
+
+		if (d.redirect &&
+		    d.redirect[0] == '/' &&
+		    poe != dentry->d_sb->s_root->d_fsdata) {
+			poe = dentry->d_sb->s_root->d_fsdata;
+
+			/* Find the current layer on the root dentry */
+			for (i = 0; i < poe->numlower; i++)
+				if (poe->lowerstack[i].mnt == lowerpath.mnt)
+					break;
+			if (WARN_ON(i == poe->numlower))
+				break;
+		}
+	}
+
+	oe = ovl_alloc_entry(ctr);
+	err = -ENOMEM;
+	if (!oe)
+		goto out_put;
+
+	if (upperdentry || ctr) {
+		struct dentry *realdentry;
+		struct inode *realinode;
+
+		realdentry = upperdentry ? upperdentry : stack[0].dentry;
+		realinode = d_inode(realdentry);
+
+		err = -ENOMEM;
+		if (upperdentry && !d_is_dir(upperdentry)) {
+			inode = ovl_get_inode(dentry->d_sb, realinode);
+		} else {
+			inode = ovl_new_inode(dentry->d_sb, realinode->i_mode,
+					      realinode->i_rdev);
+			if (inode)
+				ovl_inode_init(inode, realinode, !!upperdentry);
+		}
+		if (!inode)
+			goto out_free_oe;
+		ovl_copyattr(realdentry->d_inode, inode);
+	}
+
+	revert_creds(old_cred);
+	/* Ownership of upperredirect, upperdentry and the stack refs moves to oe */
+	oe->opaque = upperopaque;
+	oe->redirect = upperredirect;
+	oe->__upperdentry = upperdentry;
+	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+	kfree(stack);
+	kfree(d.redirect);
+	dentry->d_fsdata = oe;
+	d_add(dentry, inode);
+
+	return NULL;
+
+out_free_oe:
+	kfree(oe);
+out_put:
+	for (i = 0; i < ctr; i++)
+		dput(stack[i].dentry);
+	kfree(stack);
+out_put_upper:
+	dput(upperdentry);
+	kfree(upperredirect);
+out:
+	kfree(d.redirect);
+	revert_creds(old_cred);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tell whether something exists below the upper layer under @dentry's name.
+ *
+ * The parent's lower directories are probed top down with the dentry's own
+ * name; the first positive hit decides (a whiteout counts as "nothing
+ * there"). Lookup errors other than -ENOENT/-ENAMETOOLONG are treated as
+ * "something is there".
+ */
+bool ovl_lower_positive(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+	struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+	const struct qstr *name = &dentry->d_name;
+	unsigned int layer;
+
+	/*
+	 * If dentry is negative, then lower is positive iff this is a
+	 * whiteout.
+	 */
+	if (!dentry->d_inode)
+		return oe->opaque;
+
+	/* Negative upper -> positive lower */
+	if (!oe->__upperdentry)
+		return true;
+
+	/* Positive upper -> have to look up lower to see whether it exists */
+	for (layer = 0; layer < poe->numlower; layer++) {
+		struct dentry *lowerdir = poe->lowerstack[layer].dentry;
+		struct dentry *found;
+		bool exists;
+
+		found = lookup_one_len_unlocked(name->name, lowerdir,
+						name->len);
+		if (IS_ERR(found)) {
+			long err = PTR_ERR(found);
+
+			if (err == -ENOENT || err == -ENAMETOOLONG)
+				continue;
+			/*
+			 * Assume something is there, we just couldn't
+			 * access it.
+			 */
+			return true;
+		}
+		if (!found->d_inode) {
+			dput(found);
+			continue;
+		}
+
+		/* First positive hit is decisive */
+		exists = !ovl_is_whiteout(found);
+		dput(found);
+		return exists;
+	}
+
+	return false;
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e218e741cb99..8af450b0e57a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -9,23 +9,17 @@
#include <linux/kernel.h>
-struct ovl_entry;
-
enum ovl_path_type {
- __OVL_PATH_PURE = (1 << 0),
- __OVL_PATH_UPPER = (1 << 1),
- __OVL_PATH_MERGE = (1 << 2),
+ __OVL_PATH_UPPER = (1 << 0),
+ __OVL_PATH_MERGE = (1 << 1),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
-#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
-#define OVL_TYPE_MERGE_OR_LOWER(type) \
- (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
-
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
+#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
#define OVL_ISUPPER_MASK 1UL
@@ -143,35 +137,43 @@ static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper)
return (struct inode *) (x & ~OVL_ISUPPER_MASK);
}
+/* util.c */
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+struct dentry *ovl_workdir(struct dentry *dentry);
+const struct cred *ovl_override_creds(struct super_block *sb);
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
+bool ovl_dentry_remote(struct dentry *dentry);
+bool ovl_dentry_weird(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
-u64 ovl_dentry_version_get(struct dentry *dentry);
-void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
-struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
- bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
-struct dentry *ovl_workdir(struct dentry *dentry);
-int ovl_want_write(struct dentry *dentry);
-void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
-void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
-bool ovl_is_whiteout(struct dentry *dentry);
-const struct cred *ovl_override_creds(struct super_block *sb);
+bool ovl_dentry_is_whiteout(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_redirect_dir(struct super_block *sb);
+void ovl_clear_redirect_dir(struct super_block *sb);
+const char *ovl_dentry_get_redirect(struct dentry *dentry);
+void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+void ovl_inode_init(struct inode *inode, struct inode *realinode,
+ bool is_upper);
void ovl_inode_update(struct inode *inode, struct inode *upperinode);
-struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags);
+void ovl_dentry_version_inc(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(struct path *path, int flags);
-struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
- struct kstat *stat, const char *link);
+/* namei.c */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags);
+bool ovl_lower_positive(struct dentry *dentry);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
@@ -195,7 +197,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
bool ovl_is_private_xattr(const char *name);
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
@@ -210,14 +212,18 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+struct cattr {
+ dev_t rdev;
+ umode_t mode;
+ const char *link;
+};
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct kstat *stat, const char *link,
+ struct cattr *attr,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
-int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat);
+int ovl_copy_up_flags(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
new file mode 100644
index 000000000000..d14bca1850d9
--- /dev/null
+++ b/fs/overlayfs/ovl_entry.h
@@ -0,0 +1,53 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+/* Per-mount configuration; embedded in struct ovl_fs as "config" */
+struct ovl_config {
+ char *lowerdir; /* lower dir pathname(s); kept for show_options per the ovl_fs comment */
+ char *upperdir; /* upper dir pathname; kept for show_options per the ovl_fs comment */
+ char *workdir; /* NOTE(review): presumably the workdir pathname - confirm in super.c */
+ bool default_permissions;
+ bool redirect_dir; /* NOTE(review): presumably the per-mount redirect_dir switch - confirm in super.c */
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+ struct vfsmount *upper_mnt;
+ unsigned numlower; /* ovl_lookup() sizes its temporary lower stack with this */
+ struct vfsmount **lower_mnt; /* NOTE(review): presumably numlower entries - confirm in super.c */
+ struct dentry *workdir;
+ long namelen; /* ovl_lookup() fails longer names with -ENAMETOOLONG */
+ /* pathnames of lower and upper dirs, for show_options */
+ struct ovl_config config;
+ /* creds of process who forced instantiation of super block */
+ const struct cred *creator_cred;
+};
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+ struct dentry *__upperdentry; /* read through ovl_upperdentry_dereference() */
+ struct ovl_dir_cache *cache;
+ union {
+ struct {
+ u64 version;
+ const char *redirect; /* heap string; kfree()d in ovl_dentry_release() */
+ bool opaque; /* taken from the upper layer lookup result in ovl_lookup() */
+ };
+ struct rcu_head rcu; /* used by kfree_rcu() in ovl_dentry_release(); overlays the fields above */
+ };
+ unsigned numlower; /* number of entries in lowerstack[] */
+ struct path lowerstack[]; /* lower layer paths, filled in by ovl_lookup() */
+};
+
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
+
+/* Fetch oe->__upperdentry through lockless_dereference(). */
+static inline struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+	struct dentry *upper = lockless_dereference(oe->__upperdentry);
+
+	return upper;
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 0e100856c7b8..20f48abbb82f 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -9,280 +9,29 @@
#include <linux/fs.h>
#include <linux/namei.h>
-#include <linux/pagemap.h>
#include <linux/xattr.h>
-#include <linux/security.h>
#include <linux/mount.h>
-#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/statfs.h>
#include <linux/seq_file.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
+#include "ovl_entry.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Overlay filesystem");
MODULE_LICENSE("GPL");
-struct ovl_config {
- char *lowerdir;
- char *upperdir;
- char *workdir;
- bool default_permissions;
-};
-
-/* private information held for overlayfs's superblock */
-struct ovl_fs {
- struct vfsmount *upper_mnt;
- unsigned numlower;
- struct vfsmount **lower_mnt;
- struct dentry *workdir;
- long lower_namelen;
- /* pathnames of lower and upper dirs, for show_options */
- struct ovl_config config;
- /* creds of process who forced instantiation of super block */
- const struct cred *creator_cred;
-};
struct ovl_dir_cache;
-/* private information held for every overlayfs dentry */
-struct ovl_entry {
- struct dentry *__upperdentry;
- struct ovl_dir_cache *cache;
- union {
- struct {
- u64 version;
- bool opaque;
- };
- struct rcu_head rcu;
- };
- unsigned numlower;
- struct path lowerstack[];
-};
-
#define OVL_MAX_STACK 500
-static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
-{
- return oe->numlower ? oe->lowerstack[0].dentry : NULL;
-}
-
-enum ovl_path_type ovl_path_type(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
- enum ovl_path_type type = 0;
-
- if (oe->__upperdentry) {
- type = __OVL_PATH_UPPER;
-
- /*
- * Non-dir dentry can hold lower dentry from previous
- * location. Its purity depends only on opaque flag.
- */
- if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- else if (!oe->opaque)
- type |= __OVL_PATH_PURE;
- } else {
- if (oe->numlower > 1)
- type |= __OVL_PATH_MERGE;
- }
- return type;
-}
-
-static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
-{
- return lockless_dereference(oe->__upperdentry);
-}
-
-void ovl_path_upper(struct dentry *dentry, struct path *path)
-{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- struct ovl_entry *oe = dentry->d_fsdata;
-
- path->mnt = ofs->upper_mnt;
- path->dentry = ovl_upperdentry_dereference(oe);
-}
-
-enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
-{
- enum ovl_path_type type = ovl_path_type(dentry);
-
- if (!OVL_TYPE_UPPER(type))
- ovl_path_lower(dentry, path);
- else
- ovl_path_upper(dentry, path);
-
- return type;
-}
-
-struct dentry *ovl_dentry_upper(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- return ovl_upperdentry_dereference(oe);
-}
-
-struct dentry *ovl_dentry_lower(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- return __ovl_dentry_lower(oe);
-}
-
-struct dentry *ovl_dentry_real(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
- struct dentry *realdentry;
-
- realdentry = ovl_upperdentry_dereference(oe);
- if (!realdentry)
- realdentry = __ovl_dentry_lower(oe);
-
- return realdentry;
-}
-
-static void ovl_inode_init(struct inode *inode, struct inode *realinode,
- bool is_upper)
-{
- WRITE_ONCE(inode->i_private, (unsigned long) realinode |
- (is_upper ? OVL_ISUPPER_MASK : 0));
-}
-
-struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
- bool is_upper)
-{
- if (is_upper) {
- struct ovl_fs *ofs = inode->i_sb->s_fs_info;
-
- return ofs->upper_mnt;
- } else {
- return oe->numlower ? oe->lowerstack[0].mnt : NULL;
- }
-}
-
-struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- return oe->cache;
-}
-
-void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- oe->cache = cache;
-}
-
-void ovl_path_lower(struct dentry *dentry, struct path *path)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
-}
-
-int ovl_want_write(struct dentry *dentry)
-{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- return mnt_want_write(ofs->upper_mnt);
-}
-
-void ovl_drop_write(struct dentry *dentry)
-{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- mnt_drop_write(ofs->upper_mnt);
-}
-
-struct dentry *ovl_workdir(struct dentry *dentry)
-{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- return ofs->workdir;
-}
-
-bool ovl_dentry_is_opaque(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
- return oe->opaque;
-}
-
-void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
- oe->opaque = opaque;
-}
-
-void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
- WARN_ON(oe->__upperdentry);
- /*
- * Make sure upperdentry is consistent before making it visible to
- * ovl_upperdentry_dereference().
- */
- smp_wmb();
- oe->__upperdentry = upperdentry;
-}
-
-void ovl_inode_update(struct inode *inode, struct inode *upperinode)
-{
- WARN_ON(!upperinode);
- WARN_ON(!inode_unhashed(inode));
- WRITE_ONCE(inode->i_private,
- (unsigned long) upperinode | OVL_ISUPPER_MASK);
- if (!S_ISDIR(upperinode->i_mode))
- __insert_inode_hash(inode, (unsigned long) upperinode);
-}
-
-void ovl_dentry_version_inc(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- WARN_ON(!inode_is_locked(dentry->d_inode));
- oe->version++;
-}
-
-u64 ovl_dentry_version_get(struct dentry *dentry)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- WARN_ON(!inode_is_locked(dentry->d_inode));
- return oe->version;
-}
-
-bool ovl_is_whiteout(struct dentry *dentry)
-{
- struct inode *inode = dentry->d_inode;
-
- return inode && IS_WHITEOUT(inode);
-}
-
-const struct cred *ovl_override_creds(struct super_block *sb)
-{
- struct ovl_fs *ofs = sb->s_fs_info;
-
- return override_creds(ofs->creator_cred);
-}
-
-static bool ovl_is_opaquedir(struct dentry *dentry)
-{
- int res;
- char val;
-
- if (!d_is_dir(dentry))
- return false;
-
- res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
- if (res == 1 && val == 'y')
- return true;
-
- return false;
-}
+/*
+ * Default for the redirect_dir feature, exposed as module parameter
+ * "redirect_dir". MODULE_PARM_DESC must use the parameter name given to
+ * module_param_named(), not the name of the backing variable, or the
+ * description is attached to a parameter that does not exist.
+ */
+static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR);
+module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644);
+MODULE_PARM_DESC(redirect_dir,
+		 "Default to on or off for the redirect_dir feature");
static void ovl_dentry_release(struct dentry *dentry)
{
@@ -292,6 +41,7 @@ static void ovl_dentry_release(struct dentry *dentry)
unsigned int i;
dput(oe->__upperdentry);
+ kfree(oe->redirect);
for (i = 0; i < oe->numlower; i++)
dput(oe->lowerstack[i].dentry);
kfree_rcu(oe, rcu);
@@ -304,7 +54,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
{
struct dentry *real;
- if (d_is_dir(dentry)) {
+ if (!d_is_reg(dentry)) {
if (!inode || inode == d_inode(dentry))
return dentry;
goto bug;
@@ -392,226 +142,6 @@ static const struct dentry_operations ovl_reval_dentry_operations = {
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
-static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
-{
- size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
- struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
-
- if (oe)
- oe->numlower = numlower;
-
- return oe;
-}
-
-static bool ovl_dentry_remote(struct dentry *dentry)
-{
- return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_REAL);
-}
-
-static bool ovl_dentry_weird(struct dentry *dentry)
-{
- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
- DCACHE_MANAGE_TRANSIT |
- DCACHE_OP_HASH |
- DCACHE_OP_COMPARE);
-}
-
-static inline struct dentry *ovl_lookup_real(struct dentry *dir,
- const struct qstr *name)
-{
- struct dentry *dentry;
-
- dentry = lookup_one_len_unlocked(name->name, dir, name->len);
-
- if (IS_ERR(dentry)) {
- if (PTR_ERR(dentry) == -ENOENT)
- dentry = NULL;
- } else if (!dentry->d_inode) {
- dput(dentry);
- dentry = NULL;
- } else if (ovl_dentry_weird(dentry)) {
- dput(dentry);
- /* Don't support traversing automounts and other weirdness */
- dentry = ERR_PTR(-EREMOTE);
- }
- return dentry;
-}
-
-/*
- * Returns next layer in stack starting from top.
- * Returns -1 if this is the last layer.
- */
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
-{
- struct ovl_entry *oe = dentry->d_fsdata;
-
- BUG_ON(idx < 0);
- if (idx == 0) {
- ovl_path_upper(dentry, path);
- if (path->dentry)
- return oe->numlower ? 1 : -1;
- idx++;
- }
- BUG_ON(idx > oe->numlower);
- *path = oe->lowerstack[idx - 1];
-
- return (idx < oe->numlower) ? idx + 1 : -1;
-}
-
-struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct ovl_entry *oe;
- const struct cred *old_cred;
- struct ovl_entry *poe = dentry->d_parent->d_fsdata;
- struct path *stack = NULL;
- struct dentry *upperdir, *upperdentry = NULL;
- unsigned int ctr = 0;
- struct inode *inode = NULL;
- bool upperopaque = false;
- struct dentry *this, *prev = NULL;
- unsigned int i;
- int err;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- upperdir = ovl_upperdentry_dereference(poe);
- if (upperdir) {
- this = ovl_lookup_real(upperdir, &dentry->d_name);
- err = PTR_ERR(this);
- if (IS_ERR(this))
- goto out;
-
- if (this) {
- if (unlikely(ovl_dentry_remote(this))) {
- dput(this);
- err = -EREMOTE;
- goto out;
- }
- if (ovl_is_whiteout(this)) {
- dput(this);
- this = NULL;
- upperopaque = true;
- } else if (poe->numlower && ovl_is_opaquedir(this)) {
- upperopaque = true;
- }
- }
- upperdentry = prev = this;
- }
-
- if (!upperopaque && poe->numlower) {
- err = -ENOMEM;
- stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
- if (!stack)
- goto out_put_upper;
- }
-
- for (i = 0; !upperopaque && i < poe->numlower; i++) {
- bool opaque = false;
- struct path lowerpath = poe->lowerstack[i];
-
- this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
- err = PTR_ERR(this);
- if (IS_ERR(this)) {
- /*
- * If it's positive, then treat ENAMETOOLONG as ENOENT.
- */
- if (err == -ENAMETOOLONG && (upperdentry || ctr))
- continue;
- goto out_put;
- }
- if (!this)
- continue;
- if (ovl_is_whiteout(this)) {
- dput(this);
- break;
- }
- /*
- * Only makes sense to check opaque dir if this is not the
- * lowermost layer.
- */
- if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
- opaque = true;
-
- if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
- !S_ISDIR(this->d_inode->i_mode))) {
- /*
- * FIXME: check for upper-opaqueness maybe better done
- * in remove code.
- */
- if (prev == upperdentry)
- upperopaque = true;
- dput(this);
- break;
- }
- /*
- * If this is a non-directory then stop here.
- */
- if (!S_ISDIR(this->d_inode->i_mode))
- opaque = true;
-
- stack[ctr].dentry = this;
- stack[ctr].mnt = lowerpath.mnt;
- ctr++;
- prev = this;
- if (opaque)
- break;
- }
-
- oe = ovl_alloc_entry(ctr);
- err = -ENOMEM;
- if (!oe)
- goto out_put;
-
- if (upperdentry || ctr) {
- struct dentry *realdentry;
- struct inode *realinode;
-
- realdentry = upperdentry ? upperdentry : stack[0].dentry;
- realinode = d_inode(realdentry);
-
- err = -ENOMEM;
- if (upperdentry && !d_is_dir(upperdentry)) {
- inode = ovl_get_inode(dentry->d_sb, realinode);
- } else {
- inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
- if (inode)
- ovl_inode_init(inode, realinode, !!upperdentry);
- }
- if (!inode)
- goto out_free_oe;
- ovl_copyattr(realdentry->d_inode, inode);
- }
-
- revert_creds(old_cred);
- oe->opaque = upperopaque;
- oe->__upperdentry = upperdentry;
- memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
- kfree(stack);
- dentry->d_fsdata = oe;
- d_add(dentry, inode);
-
- return NULL;
-
-out_free_oe:
- kfree(oe);
-out_put:
- for (i = 0; i < ctr; i++)
- dput(stack[i].dentry);
- kfree(stack);
-out_put_upper:
- dput(upperdentry);
-out:
- revert_creds(old_cred);
- return ERR_PTR(err);
-}
-
-struct file *ovl_path_open(struct path *path, int flags)
-{
- return dentry_open(path, flags | O_NOATIME, current_cred());
-}
-
static void ovl_put_super(struct super_block *sb)
{
struct ovl_fs *ufs = sb->s_fs_info;
@@ -649,7 +179,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
err = vfs_statfs(&path, buf);
if (!err) {
- buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+ buf->f_namelen = ofs->namelen;
buf->f_type = OVERLAYFS_SUPER_MAGIC;
}
@@ -674,6 +204,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
}
if (ufs->config.default_permissions)
seq_puts(m, ",default_permissions");
+ if (ufs->config.redirect_dir != ovl_redirect_dir_def)
+ seq_printf(m, ",redirect_dir=%s",
+ ufs->config.redirect_dir ? "on" : "off");
return 0;
}
@@ -700,6 +233,8 @@ enum {
OPT_UPPERDIR,
OPT_WORKDIR,
OPT_DEFAULT_PERMISSIONS,
+ OPT_REDIRECT_DIR_ON,
+ OPT_REDIRECT_DIR_OFF,
OPT_ERR,
};
@@ -708,6 +243,8 @@ static const match_table_t ovl_tokens = {
{OPT_UPPERDIR, "upperdir=%s"},
{OPT_WORKDIR, "workdir=%s"},
{OPT_DEFAULT_PERMISSIONS, "default_permissions"},
+ {OPT_REDIRECT_DIR_ON, "redirect_dir=on"},
+ {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"},
{OPT_ERR, NULL}
};
@@ -772,6 +309,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->default_permissions = true;
break;
+ case OPT_REDIRECT_DIR_ON:
+ config->redirect_dir = true;
+ break;
+
+ case OPT_REDIRECT_DIR_OFF:
+ config->redirect_dir = false;
+ break;
+
default:
pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
@@ -809,12 +354,9 @@ retry:
strlen(OVL_WORKDIR_NAME));
if (!IS_ERR(work)) {
- struct kstat stat = {
- .mode = S_IFDIR | 0,
- };
struct iattr attr = {
.ia_valid = ATTR_MODE,
- .ia_mode = stat.mode,
+ .ia_mode = S_IFDIR | 0,
};
if (work->d_inode) {
@@ -828,7 +370,9 @@ retry:
goto retry;
}
- err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+ err = ovl_create_real(dir, work,
+ &(struct cattr){.mode = S_IFDIR | 0},
+ NULL, true);
if (err)
goto out_dput;
@@ -903,7 +447,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("overlayfs: filesystem on '%s' not supported\n", name);
goto out_put;
}
- if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+ if (!d_is_dir(path->dentry)) {
pr_err("overlayfs: '%s' not a directory\n", name);
goto out_put;
}
@@ -936,22 +480,33 @@ static int ovl_mount_dir(const char *name, struct path *path)
return err;
}
-static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
- int *stack_depth, bool *remote)
+static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
+ const char *name)
{
- int err;
struct kstatfs statfs;
+ int err = vfs_statfs(path, &statfs);
+
+ if (err)
+ pr_err("overlayfs: statfs failed on '%s'\n", name);
+ else
+ ofs->namelen = max(ofs->namelen, statfs.f_namelen);
+
+ return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path,
+ struct ovl_fs *ofs, int *stack_depth, bool *remote)
+{
+ int err;
err = ovl_mount_dir_noesc(name, path);
if (err)
goto out;
- err = vfs_statfs(path, &statfs);
- if (err) {
- pr_err("overlayfs: statfs failed on '%s'\n", name);
+ err = ovl_check_namelen(path, ofs, name);
+ if (err)
goto out_put;
- }
- *namelen = max(*namelen, statfs.f_namelen);
+
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
if (ovl_dentry_remote(path->dentry))
@@ -1067,7 +622,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
- return -EPERM;
+ return -EOPNOTSUPP;
}
static int ovl_own_xattr_set(const struct xattr_handler *handler,
@@ -1075,7 +630,7 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler,
const char *name, const void *value,
size_t size, int flags)
{
- return -EPERM;
+ return -EOPNOTSUPP;
}
static int ovl_other_xattr_get(const struct xattr_handler *handler,
@@ -1153,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ufs)
goto out;
+ ufs->config.redirect_dir = ovl_redirect_dir_def;
err = ovl_parse_opt((char *) data, &ufs->config);
if (err)
goto out_free_config;
@@ -1183,6 +739,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_put_upperpath;
}
+ err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir);
+ if (err)
+ goto out_put_upperpath;
+
err = ovl_mount_dir(ufs->config.workdir, &workpath);
if (err)
goto out_put_upperpath;
@@ -1214,15 +774,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_free_lowertmp;
}
+ err = -ENOMEM;
stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
if (!stack)
goto out_free_lowertmp;
+ err = -EINVAL;
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
- err = ovl_lower_dir(lower, &stack[numlower],
- &ufs->lower_namelen, &sb->s_stack_depth,
- &remote);
+ err = ovl_lower_dir(lower, &stack[numlower], ufs,
+ &sb->s_stack_depth, &remote);
if (err)
goto out_put_lowerpath;
@@ -1324,7 +885,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = ufs;
sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
+ root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
if (!root_dentry)
goto out_free_oe;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
new file mode 100644
index 000000000000..952286f4826c
--- /dev/null
+++ b/fs/overlayfs/util.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (C) 2011 Novell Inc.
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+#include "ovl_entry.h"
+
+int ovl_want_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return ofs->workdir;
+}
+
+const struct cred *ovl_override_creds(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return override_creds(ofs->creator_cred);
+}
+
+struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
+{
+ size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+ struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+ if (oe)
+ oe->numlower = numlower;
+
+ return oe;
+}
+
+bool ovl_dentry_remote(struct dentry *dentry)
+{
+ return dentry->d_flags &
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
+ DCACHE_OP_REAL);
+}
+
+bool ovl_dentry_weird(struct dentry *dentry)
+{
+ return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+ DCACHE_MANAGE_TRANSIT |
+ DCACHE_OP_HASH |
+ DCACHE_OP_COMPARE);
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ enum ovl_path_type type = 0;
+
+ if (oe->__upperdentry) {
+ type = __OVL_PATH_UPPER;
+
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location.
+ */
+ if (oe->numlower && d_is_dir(dentry))
+ type |= __OVL_PATH_MERGE;
+ } else {
+ if (oe->numlower > 1)
+ type |= __OVL_PATH_MERGE;
+ }
+ return type;
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->upper_mnt;
+ path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (!OVL_TYPE_UPPER(type))
+ ovl_path_lower(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return ovl_upperdentry_dereference(oe);
+}
+
+static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
+{
+ return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return __ovl_dentry_lower(oe);
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (!realdentry)
+ realdentry = __ovl_dentry_lower(oe);
+
+ return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ oe->cache = cache;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ return oe->opaque;
+}
+
+bool ovl_dentry_is_whiteout(struct dentry *dentry)
+{
+ return !dentry->d_inode && ovl_dentry_is_opaque(dentry);
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ oe->opaque = true;
+}
+
+bool ovl_redirect_dir(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->config.redirect_dir;
+}
+
+void ovl_clear_redirect_dir(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ ofs->config.redirect_dir = false;
+}
+
+const char *ovl_dentry_get_redirect(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->redirect;
+}
+
+void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ kfree(oe->redirect);
+ oe->redirect = redirect;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
+ WARN_ON(oe->__upperdentry);
+ /*
+ * Make sure upperdentry is consistent before making it visible to
+ * ovl_upperdentry_dereference().
+ */
+ smp_wmb();
+ oe->__upperdentry = upperdentry;
+}
+
+void ovl_inode_init(struct inode *inode, struct inode *realinode, bool is_upper)
+{
+ WRITE_ONCE(inode->i_private, (unsigned long) realinode |
+ (is_upper ? OVL_ISUPPER_MASK : 0));
+}
+
+void ovl_inode_update(struct inode *inode, struct inode *upperinode)
+{
+ WARN_ON(!upperinode);
+ WARN_ON(!inode_unhashed(inode));
+ WRITE_ONCE(inode->i_private,
+ (unsigned long) upperinode | OVL_ISUPPER_MASK);
+ if (!S_ISDIR(upperinode->i_mode))
+ __insert_inode_hash(inode, (unsigned long) upperinode);
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!inode_is_locked(dentry->d_inode));
+ oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!inode_is_locked(dentry->d_inode));
+ return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ return inode && IS_WHITEOUT(inode);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+ return dentry_open(path, flags | O_NOATIME, current_cred());
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index 8e0d9f26dfad..73b84baf58f8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -23,7 +23,7 @@
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include "internal.h"
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..06a793f4ae38 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -67,49 +67,47 @@ int get_dominating_id(struct mount *mnt, const struct path *root)
static int do_make_slave(struct mount *mnt)
{
- struct mount *peer_mnt = mnt, *master = mnt->mnt_master;
- struct mount *slave_mnt;
+ struct mount *master, *slave_mnt;
- /*
- * slave 'mnt' to a peer mount that has the
- * same root dentry. If none is available then
- * slave it to anything that is available.
- */
- while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
- peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ;
-
- if (peer_mnt == mnt) {
- peer_mnt = next_peer(mnt);
- if (peer_mnt == mnt)
- peer_mnt = NULL;
- }
- if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
- list_empty(&mnt->mnt_share))
- mnt_release_group_id(mnt);
-
- list_del_init(&mnt->mnt_share);
- mnt->mnt_group_id = 0;
-
- if (peer_mnt)
- master = peer_mnt;
-
- if (master) {
- list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
- slave_mnt->mnt_master = master;
- list_move(&mnt->mnt_slave, &master->mnt_slave_list);
- list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
+ if (list_empty(&mnt->mnt_share)) {
+ if (IS_MNT_SHARED(mnt)) {
+ mnt_release_group_id(mnt);
+ CLEAR_MNT_SHARED(mnt);
+ }
+ master = mnt->mnt_master;
+ if (!master) {
+ struct list_head *p = &mnt->mnt_slave_list;
+ while (!list_empty(p)) {
+ slave_mnt = list_first_entry(p,
+ struct mount, mnt_slave);
+ list_del_init(&slave_mnt->mnt_slave);
+ slave_mnt->mnt_master = NULL;
+ }
+ return 0;
+ }
} else {
- struct list_head *p = &mnt->mnt_slave_list;
- while (!list_empty(p)) {
- slave_mnt = list_first_entry(p,
- struct mount, mnt_slave);
- list_del_init(&slave_mnt->mnt_slave);
- slave_mnt->mnt_master = NULL;
+ struct mount *m;
+ /*
+ * slave 'mnt' to a peer mount that has the
+ * same root dentry. If none is available then
+ * slave it to anything that is available.
+ */
+ for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
+ if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
+ master = m;
+ break;
+ }
}
+ list_del_init(&mnt->mnt_share);
+ mnt->mnt_group_id = 0;
+ CLEAR_MNT_SHARED(mnt);
}
+ list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
+ slave_mnt->mnt_master = master;
+ list_move(&mnt->mnt_slave, &master->mnt_slave_list);
+ list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
+ INIT_LIST_HEAD(&mnt->mnt_slave_list);
mnt->mnt_master = master;
- CLEAR_MNT_SHARED(mnt);
return 0;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 595522022aca..c9d48dc78495 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
int error;
if (type == ACL_TYPE_ACCESS) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
- return 0;
- if (error == 0)
- acl = NULL;
+ error = posix_acl_update_mode(inode,
+ &inode->i_mode, &acl);
+ if (error)
+ return error;
}
inode->i_ctime = current_time(inode);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 81818adb8e9e..51a4213afa2e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header,
if (sigismember(set, i+2)) x |= 2;
if (sigismember(set, i+3)) x |= 4;
if (sigismember(set, i+4)) x |= 8;
- seq_printf(m, "%x", x);
+ seq_putc(m, hex_asc[x]);
} while (i >= 4);
seq_putc(m, '\n');
@@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
+ seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
- seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
- seq_putc(m, '\n');
+ seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
+ seq_putc(m, '\n');
}
static inline void task_context_switch_counts(struct seq_file *m,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ca651ac00660..87c9a9aacda3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -47,7 +47,7 @@
* Overall revision about smaps.
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
@@ -104,9 +104,12 @@
* in /proc for a task before it execs a suid executable.
*/
+static u8 nlink_tid;
+static u8 nlink_tgid;
+
struct pid_entry {
const char *name;
- int len;
+ unsigned int len;
umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
@@ -139,13 +142,13 @@ struct pid_entry {
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
-static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
unsigned int n)
{
unsigned int i;
unsigned int count;
- count = 0;
+ count = 2;
for (i = 0; i < n; ++i) {
if (S_ISDIR(entries[i].mode))
++count;
@@ -1243,7 +1246,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
};
#ifdef CONFIG_AUDITSYSCALL
-#define TMPBUFLEN 21
+#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -1664,7 +1667,8 @@ const struct inode_operations proc_pid_link_inode_operations = {
/* building an inode */
-struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb,
+ struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
@@ -1678,6 +1682,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t
/* Common stuff */
ei = PROC_I(inode);
+ inode->i_mode = mode;
inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_op = &proc_def_inode_operations;
@@ -1967,7 +1972,7 @@ out:
struct map_files_info {
fmode_t mode;
- unsigned long len;
+ unsigned int len;
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
@@ -2004,7 +2009,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+ ((mode & FMODE_READ ) ? S_IRUSR : 0) |
+ ((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
return -ENOENT;
@@ -2013,12 +2020,6 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- inode->i_mode = S_IFLNK;
-
- if (mode & FMODE_READ)
- inode->i_mode |= S_IRUSR;
- if (mode & FMODE_WRITE)
- inode->i_mode |= S_IWUSR;
d_set_d_op(dentry, &tid_map_files_dentry_operations);
d_add(dentry, inode);
@@ -2372,12 +2373,11 @@ static int proc_pident_instantiate(struct inode *dir,
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
if (!inode)
goto out;
ei = PROC_I(inode);
- inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
set_nlink(inode, 2); /* Use getattr to fix if necessary */
if (p->iop)
@@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents - 1];
- for (p = ents; p <= last; p++) {
+ last = &ents[nents];
+ for (p = ents; p < last; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len))
break;
}
- if (p > last)
+ if (p >= last)
goto out;
error = proc_pident_instantiate(dir, dentry, task, p);
@@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
if (ctx->pos >= nents + 2)
goto out;
- for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+ for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
if (!proc_fill_cache(file, ctx, p->name, p->len,
proc_pident_instantiate, task, p))
break;
@@ -3059,17 +3059,15 @@ static int proc_pid_instantiate(struct inode *dir,
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
goto out;
- inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
- ARRAY_SIZE(tgid_base_stuff)));
+ set_nlink(inode, nlink_tgid);
d_set_d_op(dentry, &pid_dentry_operations);
@@ -3181,6 +3179,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
iter.tgid += 1, iter = next_tgid(ns, iter)) {
char name[PROC_NUMBUF];
int len;
+
+ cond_resched();
if (!has_pid_permissions(ns, iter.task, 2))
continue;
@@ -3352,17 +3352,15 @@ static int proc_task_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
goto out;
- inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
- ARRAY_SIZE(tid_base_stuff)));
+ set_nlink(inode, nlink_tid);
d_set_d_op(dentry, &pid_dentry_operations);
@@ -3552,3 +3550,9 @@ static const struct file_operations proc_task_operations = {
.iterate_shared = proc_task_readdir,
.llseek = generic_file_llseek,
};
+
+void __init set_proc_pid_nlink(void)
+{
+ nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d21dafef3102..4274f83bf100 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -183,14 +183,13 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
if (!inode)
goto out;
ei = PROC_I(inode);
ei->fd = fd;
- inode->i_mode = S_IFLNK;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
@@ -322,14 +321,13 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
if (!inode)
goto out;
ei = PROC_I(inode);
ei->fd = fd;
- inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5f2dc2032c79..f6a01f09f79d 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -22,7 +22,7 @@
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -479,6 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
}
return ent;
}
+EXPORT_SYMBOL(proc_create_mount_point);
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e69ebe648a34..842a5ff5b85c 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,7 +24,7 @@
#include <linux/mount.h>
#include <linux/magic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde)
/* pde is locked */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
+ /*
+ * close() (proc_reg_release()) can't delete an entry and proceed:
+ * ->release hook needs to be available at the right moment.
+ *
+ * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
+ * "struct file" needs to be available at the right moment.
+ *
+ * Therefore, first process to enter this function does ->release() and
+ * signals its completion to the other process which does nothing.
+ */
if (pdeo->closing) {
/* somebody else is doing that, just wait */
DECLARE_COMPLETION_ONSTACK(c);
@@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
- pdeo->closing = 1;
+ pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
pde->proc_fops->release(file_inode(file), file);
spin_lock(&pde->pde_unload_lock);
- list_del_init(&pdeo->lh);
+ /* After ->release. */
+ list_del(&pdeo->lh);
if (pdeo->c)
complete(pdeo->c);
kfree(pdeo);
@@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
if (atomic_add_return(BIAS, &de->in_use) != BIAS)
wait_for_completion(&c);
+ /* ->pde_openers list can't grow from now on. */
+
spin_lock(&de->pde_unload_lock);
while (!list_empty(&de->pde_openers)) {
struct pde_opener *pdeo;
@@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
/*
- * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
- * sequence. ->release won't be called because ->proc_fops will be
- * cleared. Depending on complexity of ->release, consequences vary.
+ * Ensure that
+ * 1) PDE's ->release hook will be called no matter what
+ * either normally by close()/->release, or forcefully by
+ * rmmod/remove_proc_entry.
+ *
+ * 2) rmmod isn't blocked by opening file in /proc and sitting on
+ * the descriptor (including "rmmod foo </proc/foo" scenario).
*
- * We can't wait for mercy when close will be done for real, it's
- * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
- * by hand in remove_proc_entry(). For this, save opener's credentials
- * for later.
+ * Save every "struct file" with custom ->release hook.
*/
- pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
+ pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
@@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (rv == 0 && release) {
/* To know what to release. */
pdeo->file = file;
- /* Strictly for "too late" ->release in proc_reg_release(). */
+ pdeo->closing = false;
+ pdeo->c = NULL;
spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
spin_unlock(&pde->pde_unload_lock);
@@ -410,7 +425,6 @@ static const char *proc_get_link(struct dentry *dentry,
}
const struct inode_operations proc_link_inode_operations = {
- .readlink = generic_readlink,
.get_link = proc_get_link,
};
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5378441ec1b7..2de5194ba378 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,7 +162,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int proc_setattr(struct dentry *, struct iattr *);
-extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern int pid_revalidate(struct dentry *, unsigned int);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
@@ -195,7 +195,6 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
return S_ISDIR(pde->mode) && !pde->proc_iops;
}
-struct proc_dir_entry *proc_create_mount_point(const char *name);
/*
* inode.c
@@ -203,7 +202,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name);
struct pde_opener {
struct file *file;
struct list_head lh;
- int closing;
+ bool closing;
struct completion *c;
};
extern const struct inode_operations proc_link_inode_operations;
@@ -211,6 +210,7 @@ extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
+void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 5c89a07e3d7f..0b80ad87b4d6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,7 +23,7 @@
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 05f8dcdb086e..f9387bb7631b 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -14,7 +14,7 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
extern wait_queue_head_t log_wait;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 51b8b0a8ad91..766f0c637ad1 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -92,12 +92,11 @@ static int proc_ns_instantiate(struct inode *dir,
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
if (!inode)
goto out;
ei = PROC_I(inode);
- inode->i_mode = S_IFLNK|S_IRWXUGO;
inode->i_op = &proc_ns_link_inode_operations;
ei->ns_ops = ns_ops;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index f8595e8b5cd0..75634379f82e 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/div64.h>
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3ecd445e830d..2726536489b1 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -13,7 +13,7 @@
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/kernel-page-flags.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
@@ -173,7 +173,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache);
+ if (PageSwapCache(page))
+ u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7ae6b1da7cab..ffd72a6c6e04 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,7 +8,7 @@
* proc net directory handling functions
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 55313d994895..d4e37acd4821 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -709,7 +709,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
ctl_dir = container_of(head, struct ctl_dir, header);
if (!dir_emit_dots(file, ctx))
- return 0;
+ goto out;
pos = 2;
@@ -719,6 +719,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
break;
}
}
+out:
sysctl_head_finish(head);
return 0;
}
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 15f327bed8c6..901bd06f437d 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,7 +4,7 @@
* Copyright 1997, Theodore Ts'o
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8d3e484055a6..1988440b2049 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,7 +6,7 @@
* proc root directory handling functions
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
@@ -122,6 +122,7 @@ void __init proc_root_init(void)
int err;
proc_init_inodecache();
+ set_proc_pid_nlink();
err = register_filesystem(&proc_fs_type);
if (err)
return;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 40245954c450..39857f6db5cf 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -6,18 +6,6 @@
/*
* /proc/self:
*/
-static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
-{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF];
- if (!tgid)
- return -ENOENT;
- sprintf(tmp, "%d", tgid);
- return readlink_copy(buffer, buflen, tmp);
-}
-
static const char *proc_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
@@ -38,7 +26,6 @@ static const char *proc_self_get_link(struct dentry *dentry,
}
static const struct inode_operations proc_self_inode_operations = {
- .readlink = proc_self_readlink,
.get_link = proc_self_get_link,
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 35b92d81692f..8f96a49178d0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
#include <linux/shmem_fs.h>
#include <asm/elf.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 595b90a9766c..20614b62a9b7 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -6,19 +6,6 @@
/*
* /proc/thread_self:
*/
-static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
-{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- pid_t pid = task_pid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
- if (!pid)
- return -ENOENT;
- sprintf(tmp, "%d/task/%d", tgid, pid);
- return readlink_copy(buffer, buflen, tmp);
-}
-
static const char *proc_thread_self_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
@@ -40,7 +27,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
}
static const struct inode_operations proc_thread_self_inode_operations = {
- .readlink = proc_thread_self_readlink,
.get_link = proc_thread_self_get_link,
};
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 8ab782d8b33d..5105b1599981 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -22,7 +22,7 @@
#include <linux/list.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include "internal.h"
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index be40813eff52..b42e5bd6d8ff 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -86,4 +86,4 @@ config PSTORE_RAM
Note that for historical reasons, the module will be named
"ramoops.ko".
- For more information, see Documentation/ramoops.txt.
+ For more information, see Documentation/admin-guide/ramoops.rst.
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index d4887705bb61..899d0ba0bd6c 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -27,6 +27,9 @@
#include <asm/barrier.h>
#include "internal.h"
+/* This doesn't need to be atomic: speed is chosen over correctness here. */
+static u64 pstore_ftrace_stamp;
+
static void notrace pstore_ftrace_call(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *op,
@@ -42,6 +45,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.ip = ip;
rec.parent_ip = parent_ip;
+ pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
0, sizeof(rec), psinfo);
@@ -71,10 +75,13 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
if (!on ^ pstore_ftrace_enabled)
goto out;
- if (on)
+ if (on) {
+ ftrace_ops_set_global_filter(&pstore_ftrace_ops);
ret = register_ftrace_function(&pstore_ftrace_ops);
- else
+ } else {
ret = unregister_ftrace_function(&pstore_ftrace_ops);
+ }
+
if (ret) {
pr_err("%s: unable to %sregister ftrace ops: %zd\n",
__func__, on ? "" : "un", ret);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1781dc50762e..57c0646479f5 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -107,9 +107,11 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
- seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
- pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
- (void *)rec->ip, (void *)rec->parent_ip);
+ seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
+ pstore_ftrace_decode_cpu(rec),
+ pstore_ftrace_read_timestamp(rec),
+ rec->ip, rec->parent_ip, (void *)rec->ip,
+ (void *)rec->parent_ip);
return 0;
}
@@ -197,11 +199,14 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
if (err)
return err;
- if (p->psi->erase)
+ if (p->psi->erase) {
+ mutex_lock(&p->psi->read_mutex);
p->psi->erase(p->type, p->id, p->count,
d_inode(dentry)->i_ctime, p->psi);
- else
+ mutex_unlock(&p->psi->read_mutex);
+ } else {
return -EPERM;
+ }
return simple_unlink(dir, dentry);
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index e38a22b31282..da416e6591c9 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -5,40 +5,6 @@
#include <linux/time.h>
#include <linux/pstore.h>
-#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
-#define PSTORE_CPU_IN_IP 0x1
-#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
-#define PSTORE_CPU_IN_IP 0x3
-#endif
-
-struct pstore_ftrace_record {
- unsigned long ip;
- unsigned long parent_ip;
-#ifndef PSTORE_CPU_IN_IP
- unsigned int cpu;
-#endif
-};
-
-static inline void
-pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
-{
-#ifndef PSTORE_CPU_IN_IP
- rec->cpu = cpu;
-#else
- rec->ip |= cpu;
-#endif
-}
-
-static inline unsigned int
-pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
-{
-#ifndef PSTORE_CPU_IN_IP
- return rec->cpu;
-#else
- return rec->ip & PSTORE_CPU_IN_IP;
-#endif
-}
-
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
extern void pstore_unregister_ftrace(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 14984d902a99..729677e18e36 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -493,6 +493,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (!is_locked) {
pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
, in_nmi() ? "NMI" : why);
+ return;
}
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
@@ -584,8 +585,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
- memcpy(psinfo->buf, s, c);
- psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
+ psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0,
+ s, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6ad831b9d1b8..1d887efaaf71 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -85,10 +85,10 @@ MODULE_PARM_DESC(ramoops_ecc,
"bytes ECC)");
struct ramoops_context {
- struct persistent_ram_zone **przs;
- struct persistent_ram_zone *cprz;
- struct persistent_ram_zone *fprz;
- struct persistent_ram_zone *mprz;
+ struct persistent_ram_zone **dprzs; /* Oops dump zones */
+ struct persistent_ram_zone *cprz; /* Console zone */
+ struct persistent_ram_zone **fprzs; /* Ftrace zones */
+ struct persistent_ram_zone *mprz; /* PMSG zone */
phys_addr_t phys_addr;
unsigned long size;
unsigned int memtype;
@@ -97,12 +97,14 @@ struct ramoops_context {
size_t ftrace_size;
size_t pmsg_size;
int dump_oops;
+ u32 flags;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
/* _read_cnt need clear on ramoops_pstore_open */
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
+ unsigned int max_ftrace_cnt;
unsigned int ftrace_read_cnt;
unsigned int pmsg_read_cnt;
struct pstore_info pstore;
@@ -180,16 +182,69 @@ static bool prz_ok(struct persistent_ram_zone *prz)
persistent_ram_ecc_string(prz, NULL, 0));
}
+static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
+ struct persistent_ram_zone *src)
+{
+ size_t dest_size, src_size, total, dest_off, src_off;
+ size_t dest_idx = 0, src_idx = 0, merged_idx = 0;
+ void *merged_buf;
+ struct pstore_ftrace_record *drec, *srec, *mrec;
+ size_t record_size = sizeof(struct pstore_ftrace_record);
+
+ dest_off = dest->old_log_size % record_size;
+ dest_size = dest->old_log_size - dest_off;
+
+ src_off = src->old_log_size % record_size;
+ src_size = src->old_log_size - src_off;
+
+ total = dest_size + src_size;
+ merged_buf = kmalloc(total, GFP_KERNEL);
+ if (!merged_buf)
+ return -ENOMEM;
+
+ drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off);
+ srec = (struct pstore_ftrace_record *)(src->old_log + src_off);
+ mrec = (struct pstore_ftrace_record *)(merged_buf);
+
+ while (dest_size > 0 && src_size > 0) {
+ if (pstore_ftrace_read_timestamp(&drec[dest_idx]) <
+ pstore_ftrace_read_timestamp(&srec[src_idx])) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ } else {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+ }
+
+ while (dest_size > 0) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ }
+
+ while (src_size > 0) {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+
+ kfree(dest->old_log);
+ dest->old_log = merged_buf;
+ dest->old_log_size = total;
+
+ return 0;
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
ssize_t *ecc_notice_size,
struct pstore_info *psi)
{
- ssize_t size;
+ ssize_t size = 0;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz = NULL;
int header_length = 0;
+ bool free_prz = false;
/* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
* PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
@@ -201,7 +256,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
- prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
+ prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
cxt->max_dump_cnt, id, type,
PSTORE_TYPE_DMESG, 1);
if (!prz_ok(prz))
@@ -219,14 +274,56 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
- if (!prz_ok(prz))
- prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
- 1, id, type, PSTORE_TYPE_FTRACE, 0);
+
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
1, id, type, PSTORE_TYPE_PMSG, 0);
- if (!prz_ok(prz))
- return 0;
+
+ /* ftrace is last since it may want to dynamically allocate memory. */
+ if (!prz_ok(prz) && cxt->fprzs) {
+ if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
+ prz = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt, 1, id, type,
+ PSTORE_TYPE_FTRACE, 0);
+ } else {
+ /*
+ * Build a new dummy record which combines all the
+ * per-cpu records including metadata and ecc info.
+ */
+ struct persistent_ram_zone *tmp_prz, *prz_next;
+
+ tmp_prz = kzalloc(sizeof(struct persistent_ram_zone),
+ GFP_KERNEL);
+ if (!tmp_prz)
+ return -ENOMEM;
+ free_prz = true;
+
+ while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
+ prz_next = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt,
+ cxt->max_ftrace_cnt, id,
+ type, PSTORE_TYPE_FTRACE, 0);
+
+ if (!prz_ok(prz_next))
+ continue;
+
+ tmp_prz->ecc_info = prz_next->ecc_info;
+ tmp_prz->corrected_bytes +=
+ prz_next->corrected_bytes;
+ tmp_prz->bad_blocks += prz_next->bad_blocks;
+ size = ftrace_log_combine(tmp_prz, prz_next);
+ if (size)
+ goto out;
+ }
+ *id = 0;
+ prz = tmp_prz;
+ }
+ }
+
+ if (!prz_ok(prz)) {
+ size = 0;
+ goto out;
+ }
size = persistent_ram_old_size(prz) - header_length;
@@ -234,12 +331,21 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
*ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
*buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
- if (*buf == NULL)
- return -ENOMEM;
+ if (*buf == NULL) {
+ size = -ENOMEM;
+ goto out;
+ }
memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+
persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
+out:
+ if (free_prz) {
+ kfree(prz->old_log);
+ kfree(prz);
+ }
+
return size;
}
@@ -283,15 +389,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
persistent_ram_write(cxt->cprz, buf, size);
return 0;
} else if (type == PSTORE_TYPE_FTRACE) {
- if (!cxt->fprz)
+ int zonenum;
+
+ if (!cxt->fprzs)
return -ENOMEM;
- persistent_ram_write(cxt->fprz, buf, size);
+ /*
+ * Choose zone by if we're using per-cpu buffers.
+ */
+ if (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ zonenum = smp_processor_id();
+ else
+ zonenum = 0;
+
+ persistent_ram_write(cxt->fprzs[zonenum], buf, size);
return 0;
} else if (type == PSTORE_TYPE_PMSG) {
- if (!cxt->mprz)
- return -ENOMEM;
- persistent_ram_write(cxt->mprz, buf, size);
- return 0;
+ pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__);
+ return -EINVAL;
}
if (type != PSTORE_TYPE_DMESG)
@@ -316,10 +430,10 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
if (part != 1)
return -ENOSPC;
- if (!cxt->przs)
+ if (!cxt->dprzs)
return -ENOSPC;
- prz = cxt->przs[cxt->dump_write_cnt];
+ prz = cxt->dprzs[cxt->dump_write_cnt];
hlen = ramoops_write_kmsg_hdr(prz, compressed);
if (size + hlen > prz->buffer_size)
@@ -359,13 +473,15 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
case PSTORE_TYPE_DMESG:
if (id >= cxt->max_dump_cnt)
return -EINVAL;
- prz = cxt->przs[id];
+ prz = cxt->dprzs[id];
break;
case PSTORE_TYPE_CONSOLE:
prz = cxt->cprz;
break;
case PSTORE_TYPE_FTRACE:
- prz = cxt->fprz;
+ if (id >= cxt->max_ftrace_cnt)
+ return -EINVAL;
+ prz = cxt->fprzs[id];
break;
case PSTORE_TYPE_PMSG:
prz = cxt->mprz;
@@ -396,68 +512,113 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
- if (!cxt->przs)
- return;
+ /* Free dump PRZs */
+ if (cxt->dprzs) {
+ for (i = 0; i < cxt->max_dump_cnt; i++)
+ persistent_ram_free(cxt->dprzs[i]);
- for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->przs[i]);
+ kfree(cxt->dprzs);
+ cxt->max_dump_cnt = 0;
+ }
- kfree(cxt->przs);
- cxt->max_dump_cnt = 0;
+ /* Free ftrace PRZs */
+ if (cxt->fprzs) {
+ for (i = 0; i < cxt->max_ftrace_cnt; i++)
+ persistent_ram_free(cxt->fprzs[i]);
+ kfree(cxt->fprzs);
+ cxt->max_ftrace_cnt = 0;
+ }
}
-static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
- phys_addr_t *paddr, size_t dump_mem_sz)
+static int ramoops_init_przs(const char *name,
+ struct device *dev, struct ramoops_context *cxt,
+ struct persistent_ram_zone ***przs,
+ phys_addr_t *paddr, size_t mem_sz,
+ ssize_t record_size,
+ unsigned int *cnt, u32 sig, u32 flags)
{
int err = -ENOMEM;
int i;
+ size_t zone_sz;
+ struct persistent_ram_zone **prz_ar;
- if (!cxt->record_size)
+ /* Allocate nothing for 0 mem_sz or 0 record_size. */
+ if (mem_sz == 0 || record_size == 0) {
+ *cnt = 0;
return 0;
+ }
- if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
- dev_err(dev, "no room for dumps\n");
- return -ENOMEM;
+ /*
+ * If we have a negative record size, calculate it based on
+ * mem_sz / *cnt. If we have a positive record size, calculate
+ * cnt from mem_sz / record_size.
+ */
+ if (record_size < 0) {
+ if (*cnt == 0)
+ return 0;
+ record_size = mem_sz / *cnt;
+ if (record_size == 0) {
+ dev_err(dev, "%s record size == 0 (%zu / %u)\n",
+ name, mem_sz, *cnt);
+ goto fail;
+ }
+ } else {
+ *cnt = mem_sz / record_size;
+ if (*cnt == 0) {
+ dev_err(dev, "%s record count == 0 (%zu / %zu)\n",
+ name, mem_sz, record_size);
+ goto fail;
+ }
}
- cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
- if (!cxt->max_dump_cnt)
- return -ENOMEM;
+ if (*paddr + mem_sz - cxt->phys_addr > cxt->size) {
+ dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+ name,
+ mem_sz, (unsigned long long)*paddr,
+ cxt->size, (unsigned long long)cxt->phys_addr);
+ goto fail;
+ }
- cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt,
- GFP_KERNEL);
- if (!cxt->przs) {
- dev_err(dev, "failed to initialize a prz array for dumps\n");
- goto fail_mem;
+ zone_sz = mem_sz / *cnt;
+ if (!zone_sz) {
+ dev_err(dev, "%s zone size == 0\n", name);
+ goto fail;
}
- for (i = 0; i < cxt->max_dump_cnt; i++) {
- cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
+ prz_ar = kcalloc(*cnt, sizeof(**przs), GFP_KERNEL);
+ if (!prz_ar)
+ goto fail;
+
+ for (i = 0; i < *cnt; i++) {
+ prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
- cxt->memtype);
- if (IS_ERR(cxt->przs[i])) {
- err = PTR_ERR(cxt->przs[i]);
- dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
- cxt->record_size, (unsigned long long)*paddr, err);
+ cxt->memtype, flags);
+ if (IS_ERR(prz_ar[i])) {
+ err = PTR_ERR(prz_ar[i]);
+ dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
+ name, record_size,
+ (unsigned long long)*paddr, err);
while (i > 0) {
i--;
- persistent_ram_free(cxt->przs[i]);
+ persistent_ram_free(prz_ar[i]);
}
- goto fail_prz;
+ kfree(prz_ar);
+ goto fail;
}
- *paddr += cxt->record_size;
+ *paddr += zone_sz;
}
+ *przs = prz_ar;
return 0;
-fail_prz:
- kfree(cxt->przs);
-fail_mem:
- cxt->max_dump_cnt = 0;
+
+fail:
+ *cnt = 0;
return err;
}
-static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
+static int ramoops_init_prz(const char *name,
+ struct device *dev, struct ramoops_context *cxt,
struct persistent_ram_zone **prz,
phys_addr_t *paddr, size_t sz, u32 sig)
{
@@ -465,18 +626,19 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return 0;
if (*paddr + sz - cxt->phys_addr > cxt->size) {
- dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
- sz, (unsigned long long)*paddr,
+ dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+ name, sz, (unsigned long long)*paddr,
cxt->size, (unsigned long long)cxt->phys_addr);
return -ENOMEM;
}
- *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
+ *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
+ cxt->memtype, 0);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
- dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
- sz, (unsigned long long)*paddr, err);
+ dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
+ name, sz, (unsigned long long)*paddr, err);
return err;
}
@@ -543,6 +705,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
parse_size("ftrace-size", pdata->ftrace_size);
parse_size("pmsg-size", pdata->pmsg_size);
parse_size("ecc-size", pdata->ecc_info.ecc_size);
+ parse_size("flags", pdata->flags);
#undef parse_size
@@ -561,6 +724,7 @@ static int ramoops_probe(struct platform_device *pdev)
if (dev_of_node(dev) && !pdata) {
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata) {
+ pr_err("cannot allocate platform data buffer\n");
err = -ENOMEM;
goto fail_out;
}
@@ -570,11 +734,20 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
- /* Only a single ramoops area allowed at a time, so fail extra
+ /*
+ * Only a single ramoops area allowed at a time, so fail extra
* probes.
*/
- if (cxt->max_dump_cnt)
+ if (cxt->max_dump_cnt) {
+ pr_err("already initialized\n");
goto fail_out;
+ }
+
+ /* Make sure we didn't get bogus platform data pointer. */
+ if (!pdata) {
+ pr_err("NULL platform data\n");
+ goto fail_out;
+ }
if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
!pdata->ftrace_size && !pdata->pmsg_size)) {
@@ -600,27 +773,37 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->ftrace_size = pdata->ftrace_size;
cxt->pmsg_size = pdata->pmsg_size;
cxt->dump_oops = pdata->dump_oops;
+ cxt->flags = pdata->flags;
cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
- cxt->pmsg_size;
- err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
+ err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
+ dump_mem_sz, cxt->record_size,
+ &cxt->max_dump_cnt, 0, 0);
if (err)
goto fail_out;
- err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr,
+ err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr,
cxt->console_size, 0);
if (err)
goto fail_init_cprz;
- err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
- LINUX_VERSION_CODE);
+ cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ ? nr_cpu_ids
+ : 1;
+ err = ramoops_init_przs("ftrace", dev, cxt, &cxt->fprzs, &paddr,
+ cxt->ftrace_size, -1,
+ &cxt->max_ftrace_cnt, LINUX_VERSION_CODE,
+ (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ ? PRZ_FLAG_NO_LOCK : 0);
if (err)
goto fail_init_fprz;
- err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
+ err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
+ cxt->pmsg_size, 0);
if (err)
goto fail_init_mprz;
@@ -680,7 +863,6 @@ fail_clear:
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
fail_init_mprz:
- persistent_ram_free(cxt->fprz);
fail_init_fprz:
persistent_ram_free(cxt->cprz);
fail_init_cprz:
@@ -699,7 +881,6 @@ static int ramoops_remove(struct platform_device *pdev)
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->fprz);
persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
@@ -741,6 +922,8 @@ static void ramoops_register_dummy(void)
dummy_data->ftrace_size = ramoops_ftrace_size;
dummy_data->pmsg_size = ramoops_pmsg_size;
dummy_data->dump_oops = dump_oops;
+ dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 3975deec02f8..a857338b7dab 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
return atomic_read(&prz->buffer->start);
}
-static DEFINE_RAW_SPINLOCK(buffer_lock);
-
/* increase and wrap the start pointer, returning the old value */
static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
{
int old;
int new;
- unsigned long flags;
+ unsigned long flags = 0;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->start);
new = old + a;
@@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
new -= prz->buffer_size;
atomic_set(&prz->buffer->start, new);
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
return old;
}
@@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
{
size_t old;
size_t new;
- unsigned long flags;
+ unsigned long flags = 0;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->size);
if (old == prz->buffer_size)
@@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
atomic_set(&prz->buffer->size, new);
exit:
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
}
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
@@ -465,7 +467,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
}
static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
- struct persistent_ram_ecc_info *ecc_info)
+ struct persistent_ram_ecc_info *ecc_info,
+ unsigned long flags)
{
int ret;
@@ -493,6 +496,8 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
prz->buffer->sig = sig;
persistent_ram_zap(prz);
+ prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+ prz->flags = flags;
return 0;
}
@@ -517,7 +522,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
u32 sig, struct persistent_ram_ecc_info *ecc_info,
- unsigned int memtype)
+ unsigned int memtype, u32 flags)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -532,7 +537,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
if (ret)
goto err;
- ret = persistent_ram_post_init(prz, sig, ecc_info);
+ ret = persistent_ram_post_init(prz, sig, ecc_info, flags);
if (ret)
goto err;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 1bfac28b7e7d..406fed92362a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -119,8 +119,7 @@
* spinlock to internal buffers before writing.
*
* Lock ordering (including related VFS locks) is the following:
- * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
- * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
+ * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
@@ -572,7 +571,8 @@ int dquot_scan_active(struct super_block *sb,
struct dquot *dquot, *old_dquot = NULL;
int ret = 0;
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));
+
spin_lock(&dq_list_lock);
list_for_each_entry(dquot, &inuse_list, dq_inuse) {
if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
@@ -603,7 +603,6 @@ int dquot_scan_active(struct super_block *sb,
spin_unlock(&dq_list_lock);
out:
dqput(old_dquot);
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return ret;
}
EXPORT_SYMBOL(dquot_scan_active);
@@ -617,7 +616,8 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
int cnt;
int err, ret = 0;
- mutex_lock(&dqopt->dqonoff_mutex);
+ WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
@@ -653,7 +653,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
&& info_dirty(&dqopt->info[cnt]))
sb->dq_op->write_info(sb, cnt);
dqstats_inc(DQST_SYNCS);
- mutex_unlock(&dqopt->dqonoff_mutex);
return ret;
}
@@ -683,7 +682,6 @@ int dquot_quota_sync(struct super_block *sb, int type)
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
- mutex_lock(&dqopt->dqonoff_mutex);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
@@ -693,7 +691,6 @@ int dquot_quota_sync(struct super_block *sb, int type)
truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
inode_unlock(dqopt->files[cnt]);
}
- mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
@@ -935,7 +932,7 @@ static int dqinit_needed(struct inode *inode, int type)
return 0;
}
-/* This routine is guarded by dqonoff_mutex mutex */
+/* This routine is guarded by s_umount semaphore */
static void add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
@@ -2050,21 +2047,13 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
struct quota_info *dqopt = sb_dqopt(sb);
int err;
- mutex_lock(&dqopt->dqonoff_mutex);
- if (!sb_has_quota_active(sb, qid->type)) {
- err = -ESRCH;
- goto out;
- }
- if (!dqopt->ops[qid->type]->get_next_id) {
- err = -ENOSYS;
- goto out;
- }
+ if (!sb_has_quota_active(sb, qid->type))
+ return -ESRCH;
+ if (!dqopt->ops[qid->type]->get_next_id)
+ return -ENOSYS;
mutex_lock(&dqopt->dqio_mutex);
err = dqopt->ops[qid->type]->get_next_id(sb, qid);
mutex_unlock(&dqopt->dqio_mutex);
-out:
- mutex_unlock(&dqopt->dqonoff_mutex);
-
return err;
}
EXPORT_SYMBOL(dquot_get_next_id);
@@ -2107,6 +2096,10 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *toputinode[MAXQUOTAS];
+ /* s_umount should be held in exclusive mode */
+ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
+ up_read(&sb->s_umount);
+
/* Cannot turn off usage accounting without turning off limits, or
* suspend quotas and simultaneously turn quotas off. */
if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
@@ -2114,18 +2107,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
DQUOT_USAGE_ENABLED)))
return -EINVAL;
- /* We need to serialize quota_off() for device */
- mutex_lock(&dqopt->dqonoff_mutex);
-
/*
* Skip everything if there's nothing to do. We have to do this because
* sometimes we are called when fill_super() failed and calling
* sync_fs() in such cases does no good.
*/
- if (!sb_any_quota_loaded(sb)) {
- mutex_unlock(&dqopt->dqonoff_mutex);
+ if (!sb_any_quota_loaded(sb))
return 0;
- }
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
@@ -2179,7 +2168,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
dqopt->info[cnt].dqi_bgrace = 0;
dqopt->ops[cnt] = NULL;
}
- mutex_unlock(&dqopt->dqonoff_mutex);
/* Skip syncing and setting flags if quota files are hidden */
if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
@@ -2196,20 +2184,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
* must also discard the blockdev buffers so that we see the
* changes done by userspace on the next quotaon() */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (toputinode[cnt]) {
- mutex_lock(&dqopt->dqonoff_mutex);
- /* If quota was reenabled in the meantime, we have
- * nothing to do */
- if (!sb_has_quota_loaded(sb, cnt)) {
- inode_lock(toputinode[cnt]);
- toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
+ /* This can happen when suspending quotas on remount-ro... */
+ if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
+ inode_lock(toputinode[cnt]);
+ toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
- truncate_inode_pages(&toputinode[cnt]->i_data,
- 0);
- inode_unlock(toputinode[cnt]);
- mark_inode_dirty_sync(toputinode[cnt]);
- }
- mutex_unlock(&dqopt->dqonoff_mutex);
+ truncate_inode_pages(&toputinode[cnt]->i_data, 0);
+ inode_unlock(toputinode[cnt]);
+ mark_inode_dirty_sync(toputinode[cnt]);
}
if (sb->s_bdev)
invalidate_bdev(sb->s_bdev);
@@ -2281,6 +2263,10 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = -EINVAL;
goto out_fmt;
}
+ if (sb_has_quota_loaded(sb, type)) {
+ error = -EBUSY;
+ goto out_fmt;
+ }
if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
/* As we bypass the pagecache we must now flush all the
@@ -2292,11 +2278,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
sync_filesystem(sb);
invalidate_bdev(sb->s_bdev);
}
- mutex_lock(&dqopt->dqonoff_mutex);
- if (sb_has_quota_loaded(sb, type)) {
- error = -EBUSY;
- goto out_lock;
- }
if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
/* We don't want quota and atime on quota files (deadlocks
@@ -2317,7 +2298,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = -EIO;
dqopt->files[type] = igrab(inode);
if (!dqopt->files[type])
- goto out_lock;
+ goto out_file_flags;
error = -EINVAL;
if (!fmt->qf_ops->check_quota_file(sb, type))
goto out_file_init;
@@ -2340,14 +2321,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
spin_unlock(&dq_state_lock);
add_dquot_ref(sb, type);
- mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
out_file_init:
dqopt->files[type] = NULL;
iput(inode);
-out_lock:
+out_file_flags:
if (oldflags != -1) {
inode_lock(inode);
/* Set the flags back (in the case of accidental quotaon()
@@ -2356,7 +2336,6 @@ out_lock:
inode->i_flags |= oldflags;
inode_unlock(inode);
}
- mutex_unlock(&dqopt->dqonoff_mutex);
out_fmt:
put_quota_format(fmt);
@@ -2371,15 +2350,16 @@ int dquot_resume(struct super_block *sb, int type)
int ret = 0, cnt;
unsigned int flags;
+ /* s_umount should be held in exclusive mode */
+ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
+ up_read(&sb->s_umount);
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
-
- mutex_lock(&dqopt->dqonoff_mutex);
- if (!sb_has_quota_suspended(sb, cnt)) {
- mutex_unlock(&dqopt->dqonoff_mutex);
+ if (!sb_has_quota_suspended(sb, cnt))
continue;
- }
+
inode = dqopt->files[cnt];
dqopt->files[cnt] = NULL;
spin_lock(&dq_state_lock);
@@ -2388,7 +2368,6 @@ int dquot_resume(struct super_block *sb, int type)
cnt);
dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
spin_unlock(&dq_state_lock);
- mutex_unlock(&dqopt->dqonoff_mutex);
flags = dquot_generic_flag(flags, cnt);
ret = vfs_load_quota_inode(inode, cnt,
@@ -2401,7 +2380,7 @@ int dquot_resume(struct super_block *sb, int type)
EXPORT_SYMBOL(dquot_resume);
int dquot_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path)
+ const struct path *path)
{
int error = security_quota_on(path->dentry);
if (error)
@@ -2424,42 +2403,30 @@ EXPORT_SYMBOL(dquot_quota_on);
int dquot_enable(struct inode *inode, int type, int format_id,
unsigned int flags)
{
- int ret = 0;
struct super_block *sb = inode->i_sb;
- struct quota_info *dqopt = sb_dqopt(sb);
/* Just unsuspend quotas? */
BUG_ON(flags & DQUOT_SUSPENDED);
+ /* s_umount should be held in exclusive mode */
+ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
+ up_read(&sb->s_umount);
if (!flags)
return 0;
/* Just updating flags needed? */
if (sb_has_quota_loaded(sb, type)) {
- mutex_lock(&dqopt->dqonoff_mutex);
- /* Now do a reliable test... */
- if (!sb_has_quota_loaded(sb, type)) {
- mutex_unlock(&dqopt->dqonoff_mutex);
- goto load_quota;
- }
if (flags & DQUOT_USAGE_ENABLED &&
- sb_has_quota_usage_enabled(sb, type)) {
- ret = -EBUSY;
- goto out_lock;
- }
+ sb_has_quota_usage_enabled(sb, type))
+ return -EBUSY;
if (flags & DQUOT_LIMITS_ENABLED &&
- sb_has_quota_limits_enabled(sb, type)) {
- ret = -EBUSY;
- goto out_lock;
- }
+ sb_has_quota_limits_enabled(sb, type))
+ return -EBUSY;
spin_lock(&dq_state_lock);
sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
spin_unlock(&dq_state_lock);
-out_lock:
- mutex_unlock(&dqopt->dqonoff_mutex);
- return ret;
+ return 0;
}
-load_quota:
return vfs_load_quota_inode(inode, type, format_id, flags);
}
EXPORT_SYMBOL(dquot_enable);
@@ -2751,7 +2718,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state)
struct quota_info *dqopt = sb_dqopt(sb);
int type;
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
memset(state, 0, sizeof(*state));
for (type = 0; type < MAXQUOTAS; type++) {
if (!sb_has_quota_active(sb, type))
@@ -2773,7 +2739,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state)
tstate->nextents = 1; /* We don't know... */
spin_unlock(&dq_data_lock);
}
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return 0;
}
EXPORT_SYMBOL(dquot_get_state);
@@ -2787,18 +2752,13 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
if ((ii->i_fieldmask & QC_WARNS_MASK) ||
(ii->i_fieldmask & QC_RT_SPC_TIMER))
return -EINVAL;
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_active(sb, type)) {
- err = -ESRCH;
- goto out;
- }
+ if (!sb_has_quota_active(sb, type))
+ return -ESRCH;
mi = sb_dqopt(sb)->info + type;
if (ii->i_fieldmask & QC_FLAGS) {
if ((ii->i_flags & QCI_ROOT_SQUASH &&
- mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
- err = -EINVAL;
- goto out;
- }
+ mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD))
+ return -EINVAL;
}
spin_lock(&dq_data_lock);
if (ii->i_fieldmask & QC_SPC_TIMER)
@@ -2815,8 +2775,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
mark_info_dirty(sb, type);
/* Force write to disk */
sb->dq_op->write_info(sb, type);
-out:
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return err;
}
EXPORT_SYMBOL(dquot_set_dqinfo);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 8b252673d454..e99b1a72d9a7 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -12,14 +12,8 @@ static const struct genl_multicast_group quota_mcgrps[] = {
};
/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- /*
- * Needed due to multicast group ID abuse - old code assumed
- * the family ID was also a valid multicast group ID (which
- * isn't true) and userspace might thus rely on it. Assign a
- * static ID for this group to make dealing with that easier.
- */
- .id = GENL_ID_VFS_DQUOT,
+static struct genl_family quota_genl_family __ro_after_init = {
+ .module = THIS_MODULE,
.hdrsize = 0,
.name = "VFS_DQUOT",
.version = 1,
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2d445425aad7..07e08c7d05ca 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -80,7 +80,7 @@ unsigned int qtype_enforce_flag(int type)
}
static int quota_quotaon(struct super_block *sb, int type, qid_t id,
- struct path *path)
+ const struct path *path)
{
if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
return -ENOSYS;
@@ -104,13 +104,9 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
{
__u32 fmt;
- mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_active(sb, type)) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ if (!sb_has_quota_active(sb, type))
return -ESRCH;
- }
fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
if (copy_to_user(addr, &fmt, sizeof(fmt)))
return -EFAULT;
return 0;
@@ -700,7 +696,7 @@ static int quota_rmxquota(struct super_block *sb, void __user *addr)
/* Copy parameters and call proper function */
static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
- void __user *addr, struct path *path)
+ void __user *addr, const struct path *path)
{
int ret;
@@ -789,9 +785,14 @@ static int quotactl_cmd_write(int cmd)
}
return 1;
}
-
#endif /* CONFIG_BLOCK */
+/* Return true if quotactl command is manipulating quota on/off state */
+static bool quotactl_cmd_onoff(int cmd)
+{
+ return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF);
+}
+
/*
* look up a superblock on which quota ops will be performed
* - use the name of a block device to find the superblock thereon
@@ -809,7 +810,9 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
putname(tmp);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- if (quotactl_cmd_write(cmd))
+ if (quotactl_cmd_onoff(cmd))
+ sb = get_super_exclusive_thawed(bdev);
+ else if (quotactl_cmd_write(cmd))
sb = get_super_thawed(bdev);
else
sb = get_super(bdev);
@@ -872,7 +875,10 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
ret = do_quotactl(sb, type, cmds, id, addr, pathp);
- drop_super(sb);
+ if (!quotactl_cmd_onoff(cmds))
+ drop_super(sb);
+ else
+ drop_super_exclusive(sb);
out:
if (pathp && !IS_ERR(pathp))
path_put(pathp);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2bcbf4e77982..2ef7ce75c062 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -23,7 +23,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 8621c039b536..26e45863e499 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,7 +35,7 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
#define RAMFS_DEFAULT_MODE 0755
diff --git a/fs/read_write.c b/fs/read_write.c
index 190e0d362581..5816d4c4cab0 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -20,7 +20,7 @@
#include <linux/fs.h>
#include "internal.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
@@ -1538,28 +1538,43 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (len == 0)
return 0;
- ret = mnt_want_write_file(file_out);
- if (ret)
- return ret;
+ sb_start_write(inode_out->i_sb);
- ret = -EOPNOTSUPP;
- if (file_out->f_op->copy_file_range)
+ /*
+ * Try cloning first, this is supported by more file systems, and
+ * more efficient if both clone and copy are supported (e.g. NFS).
+ */
+ if (file_in->f_op->clone_file_range) {
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ if (ret == 0) {
+ ret = len;
+ goto done;
+ }
+ }
+
+ if (file_out->f_op->copy_file_range) {
ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
pos_out, len, flags);
- if (ret == -EOPNOTSUPP)
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+ if (ret != -EOPNOTSUPP)
+ goto done;
+ }
+
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+done:
if (ret > 0) {
fsnotify_access(file_in);
add_rchar(current, ret);
fsnotify_modify(file_out);
add_wchar(current, ret);
}
+
inc_syscr(current);
inc_syscw(current);
- mnt_drop_write_file(file_out);
+ sb_end_write(inode_out->i_sb);
return ret;
}
@@ -1650,6 +1665,115 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
+/*
+ * Check that the two inodes are eligible for cloning, the ranges make
+ * sense, and then flush all dirty data. Caller must ensure that the
+ * inodes have been locked against any other modifications.
+ *
+ * Returns: 0 for "nothing to clone", 1 for "something to clone", or
+ * the usual negative error code.
+ */
+int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
+ struct inode *inode_out, loff_t pos_out,
+ u64 *len, bool is_dedupe)
+{
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ loff_t blen;
+ loff_t isize;
+ bool same_inode = (inode_in == inode_out);
+ int ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Don't reflink dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0)
+ return 0;
+
+ /* Zero length dedupe exits immediately; reflink goes to EOF. */
+ if (*len == 0) {
+ if (is_dedupe || pos_in == isize)
+ return 0;
+ if (pos_in > isize)
+ return -EINVAL;
+ *len = isize - pos_in;
+ }
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
+ pos_in + *len > isize)
+ return -EINVAL;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + *len > disize)
+ return -EINVAL;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + *len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = *len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ return -EINVAL;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ return -EINVAL;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + *len - 1);
+ if (ret)
+ return ret;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + *len - 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (is_dedupe) {
+ bool is_same = false;
+
+ ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same);
+ if (ret)
+ return ret;
+ if (!is_same)
+ return -EBADE;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+
int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
{
@@ -1657,15 +1781,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct inode *inode_out = file_inode(file_out);
int ret;
- if (inode_in->i_sb != inode_out->i_sb ||
- file_in->f_path.mnt != file_out->f_path.mnt)
- return -EXDEV;
-
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
+ /*
+ * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
+ * the same mount. Practically, they only need to be on the same file
+ * system.
+ */
+ if (inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
@@ -1685,10 +1813,6 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
if (pos_in + len > i_size_read(inode_in))
return -EINVAL;
- ret = mnt_want_write_file(file_out);
- if (ret)
- return ret;
-
ret = file_in->f_op->clone_file_range(file_in, pos_in,
file_out, pos_out, len);
if (!ret) {
@@ -1696,11 +1820,106 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
fsnotify_modify(file_out);
}
- mnt_drop_write_file(file_out);
return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+ pgoff_t n;
+
+ n = offset >> PAGE_SHIFT;
+ mapping = inode->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dest, loff_t destoff,
+ loff_t len, bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0)
+ goto out_error;
+
+ src_page = vfs_dedupe_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = vfs_dedupe_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
+
int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
{
struct file_dedupe_range_info *info;
@@ -1737,6 +1956,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
goto out;
ret = 0;
+ if (off + len > i_size_read(src))
+ return -EINVAL;
+
/* pre-format output fields to sane values */
for (i = 0; i < count; i++) {
same->info[i].bytes_deduped = 0ULL;
diff --git a/fs/readdir.c b/fs/readdir.c
index 9d0212c374d6..0e8a7f355f7a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -19,7 +19,7 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
int iterate_dir(struct file *file, struct dir_context *ctx)
{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 58b2dedb2a3a..cfeae9b0a2b7 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -19,6 +19,7 @@
#include <linux/quotaops.h>
#include <linux/swap.h>
#include <linux/uio.h>
+#include <linux/bio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bc2dde2423c2..aa40c242f1db 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s,
mark_buffer_dirty(jl->j_commit_bh) ;
depth = reiserfs_write_unlock_nested(s);
if (reiserfs_barrier_flush(s))
- __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+ __sync_dirty_buffer(jl->j_commit_bh,
+ REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(jl->j_commit_bh);
reiserfs_write_lock_nested(s, depth);
@@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb,
depth = reiserfs_write_unlock_nested(sb);
if (reiserfs_barrier_flush(sb))
- __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+ __sync_dirty_buffer(journal->j_header_bh,
+ REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(journal->j_header_bh);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e6a2b406af36..bd39a998843d 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1665,7 +1665,6 @@ const struct inode_operations reiserfs_dir_inode_operations = {
* stuff added
*/
const struct inode_operations reiserfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index a97e352d05d3..0037aea97d39 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -11,6 +11,7 @@
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
+#include <linux/bio.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0a6ad4e71e88..e314cb30a181 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -802,7 +802,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
static int reiserfs_release_dquot(struct dquot *);
static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, struct path *);
+static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
static const struct dquot_operations reiserfs_quota_operations = {
.write_dquot = reiserfs_write_dquot,
@@ -2348,7 +2348,7 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
* Standard function to be called on quota_on
*/
static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path)
+ const struct path *path)
{
int err;
struct inode *inode;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d0f8a38dfafa..0186fe6d39f3 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -74,6 +74,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
+#include <linux/major.h>
#include "internal.h"
static struct kmem_cache *romfs_inode_cachep;
@@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode)
static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ /* When calling huge_encode_dev(),
+ * use sb->s_bdev->bd_dev when,
+ * - CONFIG_ROMFS_ON_BLOCK defined
+ * use sb->s_dev when,
+ * - CONFIG_ROMFS_ON_BLOCK undefined and
+ * - CONFIG_ROMFS_ON_MTD defined
+ * leave id as 0 when,
+ * - CONFIG_ROMFS_ON_BLOCK undefined and
+ * - CONFIG_ROMFS_ON_MTD undefined
+ */
+ if (sb->s_bdev)
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
+ else if (sb->s_dev)
+ id = huge_encode_dev(sb->s_dev);
buf->f_type = ROMFS_MAGIC;
buf->f_namelen = ROMFS_MAXFN;
@@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_RDONLY | MS_NOATIME;
sb->s_op = &romfs_super_ops;
+#ifdef CONFIG_ROMFS_ON_MTD
+ /* Use same dev ID from the underlying mtdblock device */
+ if (sb->s_mtd)
+ sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index);
+#endif
/* read the image superblock and check it */
rsb = kmalloc(512, GFP_KERNEL);
if (!rsb)
diff --git a/fs/select.c b/fs/select.c
index 3d4f85defeab..305c0daf5d67 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -31,7 +31,7 @@
#include <net/busy_poll.h>
#include <linux/vmalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 368bfb92b115..ca69fb99e41a 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -15,7 +15,7 @@
#include <linux/printk.h>
#include <linux/string_helpers.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
static void seq_set_overflow(struct seq_file *m)
@@ -190,6 +190,13 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
*/
m->version = file->f_version;
+ /*
+ * if request is to read from zero offset, reset iterator to first
+ * record as it might have been already advanced by previous requests
+ */
+ if (*ppos == 0)
+ m->index = 0;
+
/* Don't assume *ppos is where we left it */
if (unlikely(*ppos != m->read_pos)) {
while ((err = traverse(m, *ppos)) == -EAGAIN)
diff --git a/fs/splice.c b/fs/splice.c
index 5a7750bd2eea..4ef78aa8ef61 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -17,6 +17,7 @@
* Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
*
*/
+#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
@@ -203,6 +204,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
buf->len = spd->partial[page_nr].len;
buf->private = spd->partial[page_nr].private;
buf->ops = spd->ops;
+ buf->flags = 0;
pipe->nrbufs++;
page_nr++;
@@ -1086,7 +1088,13 @@ EXPORT_SYMBOL(do_splice_direct);
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
- while (pipe->nrbufs == pipe->buffers) {
+ for (;;) {
+ if (unlikely(!pipe->readers)) {
+ send_sig(SIGPIPE, current, 0);
+ return -EPIPE;
+ }
+ if (pipe->nrbufs != pipe->buffers)
+ return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
if (signal_pending(current))
@@ -1095,7 +1103,6 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
pipe_wait(pipe);
pipe->waiting_writers--;
}
- return 0;
}
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index ce62a380314f..2751476e6b6e 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -31,6 +31,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 79b9c31a0c8f..befeba0fa70a 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -118,7 +118,6 @@ const struct address_space_operations squashfs_symlink_aops = {
};
const struct inode_operations squashfs_symlink_inode_ops = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.listxattr = squashfs_listxattr
};
diff --git a/fs/stat.c b/fs/stat.c
index bc045c7994e1..a268b7f27adf 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -15,7 +15,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
void generic_fillattr(struct inode *inode, struct kstat *stat)
@@ -329,12 +329,14 @@ retry:
struct inode *inode = d_backing_inode(path.dentry);
error = empty ? -ENOENT : -EINVAL;
- if (inode->i_op->readlink) {
+ /*
+ * AFS mountpoints allow readlink(2) but are not symlinks
+ */
+ if (d_is_symlink(path.dentry) || inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
touch_atime(&path);
- error = inode->i_op->readlink(path.dentry,
- buf, bufsiz);
+ error = vfs_readlink(path.dentry, buf, bufsiz);
}
}
path_put(&path);
diff --git a/fs/statfs.c b/fs/statfs.c
index 083dc0ac9140..13ae259d4879 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -63,7 +63,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
return retval;
}
-int vfs_statfs(struct path *path, struct kstatfs *buf)
+int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
int error;
diff --git a/fs/super.c b/fs/super.c
index c183835566c1..1709ed029a2c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -244,7 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
mutex_init(&s->s_vfs_rename_mutex);
lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
mutex_init(&s->s_dquot.dqio_mutex);
- mutex_init(&s->s_dquot.dqonoff_mutex);
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
@@ -558,6 +557,13 @@ void drop_super(struct super_block *sb)
EXPORT_SYMBOL(drop_super);
+void drop_super_exclusive(struct super_block *sb)
+{
+ up_write(&sb->s_umount);
+ put_super(sb);
+}
+EXPORT_SYMBOL(drop_super_exclusive);
+
/**
* iterate_supers - call function for all active superblocks
* @f: function to call
@@ -628,15 +634,7 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
-/**
- * get_super - get the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. %NULL is returned if no match is found.
- */
-
-struct super_block *get_super(struct block_device *bdev)
+static struct super_block *__get_super(struct block_device *bdev, bool excl)
{
struct super_block *sb;
@@ -651,11 +649,17 @@ rescan:
if (sb->s_bdev == bdev) {
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
+ if (!excl)
+ down_read(&sb->s_umount);
+ else
+ down_write(&sb->s_umount);
/* still alive? */
if (sb->s_root && (sb->s_flags & MS_BORN))
return sb;
- up_read(&sb->s_umount);
+ if (!excl)
+ up_read(&sb->s_umount);
+ else
+ up_write(&sb->s_umount);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
@@ -666,32 +670,67 @@ rescan:
return NULL;
}
-EXPORT_SYMBOL(get_super);
-
/**
- * get_super_thawed - get thawed superblock of a device
+ * get_super - get the superblock of a device
* @bdev: device to get the superblock for
*
* Scans the superblock list and finds the superblock of the file system
- * mounted on the device. The superblock is returned once it is thawed
- * (or immediately if it was not frozen). %NULL is returned if no match
- * is found.
+ * mounted on the device given. %NULL is returned if no match is found.
*/
-struct super_block *get_super_thawed(struct block_device *bdev)
+struct super_block *get_super(struct block_device *bdev)
+{
+ return __get_super(bdev, false); /* excl == false: s_umount is taken with down_read() */
+}
+EXPORT_SYMBOL(get_super);
+/* Like __get_super(), but if the sb is frozen: unlock s_umount, wait for the thaw, retry. */
+static struct super_block *__get_super_thawed(struct block_device *bdev,
+ bool excl)
{
while (1) {
- struct super_block *s = get_super(bdev);
+ struct super_block *s = __get_super(bdev, excl);
if (!s || s->s_writers.frozen == SB_UNFROZEN)
return s; /* NULL (no match), or an unfrozen sb with s_umount still held */
- up_read(&s->s_umount);
+ if (!excl)
+ up_read(&s->s_umount); /* release in the same mode __get_super() acquired */
+ else
+ up_write(&s->s_umount);
wait_event(s->s_writers.wait_unfrozen,
s->s_writers.frozen == SB_UNFROZEN);
put_super(s); /* done with this pass; the next iteration looks the sb up again */
}
}
+
+/**
+ * get_super_thawed - get thawed superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device. The superblock is returned once it is thawed (or
+ * immediately if it was not frozen), with s_umount held in shared mode.
+ * %NULL is returned if no match is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+ return __get_super_thawed(bdev, false);
+}
EXPORT_SYMBOL(get_super_thawed);
/**
+ * get_super_exclusive_thawed - get thawed superblock, s_umount held for write
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device. The superblock is returned once it is thawed
+ * (or immediately if it was not frozen) and s_umount semaphore is held
+ * in exclusive mode. %NULL is returned if no match is found.
+ */
+struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
+{
+ return __get_super_thawed(bdev, true); /* excl == true: s_umount via down_write() */
+}
+EXPORT_SYMBOL(get_super_exclusive_thawed);
+
+/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index d62c423a5a2d..858fb72f9e0f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -145,7 +145,6 @@ static inline void write3byte(struct sysv_sb_info *sbi,
}
static const struct inode_operations sysv_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = page_get_link,
.getattr = sysv_getattr,
};
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 9ae4abb4110b..c173cc196175 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -55,7 +55,7 @@ static inline bool isalarm(struct timerfd_ctx *ctx)
/*
* This gets called when the timer event triggers. We set the "expired"
* flag, but we do not re-arm the timer (in case it's necessary,
- * tintv.tv64 != 0) until the timer is accessed.
+ * tintv != 0) until the timer is accessed.
*/
static void timerfd_triggered(struct timerfd_ctx *ctx)
{
@@ -93,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
*/
void timerfd_clock_was_set(void)
{
- ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+ ktime_t moffs = ktime_mono_to_real(0);
struct timerfd_ctx *ctx;
unsigned long flags;
@@ -102,8 +102,8 @@ void timerfd_clock_was_set(void)
if (!ctx->might_cancel)
continue;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->moffs.tv64 != moffs.tv64) {
- ctx->moffs.tv64 = KTIME_MAX;
+ if (ctx->moffs != moffs) {
+ ctx->moffs = KTIME_MAX;
ctx->ticks++;
wake_up_locked(&ctx->wqh);
}
@@ -124,9 +124,9 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
static bool timerfd_canceled(struct timerfd_ctx *ctx)
{
- if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
+ if (!ctx->might_cancel || ctx->moffs != KTIME_MAX)
return false;
- ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+ ctx->moffs = ktime_mono_to_real(0);
return true;
}
@@ -155,7 +155,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
else
remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
- return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
+ return remaining < 0 ? 0: remaining;
}
static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
@@ -184,7 +184,7 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->t.tmr.function = timerfd_tmrproc;
}
- if (texp.tv64 != 0) {
+ if (texp != 0) {
if (isalarm(ctx)) {
if (flags & TFD_TIMER_ABSTIME)
alarm_start(&ctx->t.alarm, texp);
@@ -261,9 +261,9 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
if (ctx->ticks) {
ticks = ctx->ticks;
- if (ctx->expired && ctx->tintv.tv64) {
+ if (ctx->expired && ctx->tintv) {
/*
- * If tintv.tv64 != 0, this is a periodic timer that
+ * If tintv != 0, this is a periodic timer that
* needs to be re-armed. We avoid doing it in the timer
* callback to avoid DoS attacks specifying a very
* short timer period.
@@ -410,7 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
else
hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
- ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
+ ctx->moffs = ktime_mono_to_real(0);
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -469,7 +469,7 @@ static int do_timerfd_settime(int ufd, int flags,
* We do not update "ticks" and "expired" since the timer will be
* re-programmed again in the following timerfd_setup() call.
*/
- if (ctx->expired && ctx->tintv.tv64) {
+ if (ctx->expired && ctx->tintv) {
if (isalarm(ctx))
alarm_forward_now(&ctx->t.alarm, ctx->tintv);
else
@@ -499,7 +499,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
ctx = f.file->private_data;
spin_lock_irq(&ctx->wqh.lock);
- if (ctx->expired && ctx->tintv.tv64) {
+ if (ctx->expired && ctx->tintv) {
ctx->expired = 0;
if (isalarm(ctx)) {
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 7ff7712f284e..b0d0623c83ed 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -50,3 +50,14 @@ config UBIFS_ATIME_SUPPORT
strictatime is the "heavy", relatime is "lighter", etc.
If unsure, say 'N'
+
+config UBIFS_FS_ENCRYPTION
+ bool "UBIFS Encryption"
+ depends on UBIFS_FS && BLOCK
+ select FS_ENCRYPTION
+ default n
+ help
+ Enable encryption of UBIFS files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index c54a24360f85..6f3251c2bf08 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -5,3 +5,4 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
ubifs-y += misc.o
+ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
new file mode 100644
index 000000000000..3402720f2b28
--- /dev/null
+++ b/fs/ubifs/crypto.c
@@ -0,0 +1,97 @@
+#include "ubifs.h"
+/* fscrypt ->get_context hook: fetch @inode's encryption-context xattr via ubifs_xattr_get(). */
+static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ubifs_xattr_get(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len);
+}
+/* fscrypt ->set_context hook: pass @ctx/@len to ubifs_xattr_set() under the encryption-context name. */
+static int ubifs_crypt_set_context(struct inode *inode, const void *ctx,
+ size_t len, void *fs_data)
+{
+ return ubifs_xattr_set(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0); /* @fs_data is not used by this implementation */
+}
+/* fscrypt ->empty_dir hook: true only when ubifs_check_dir_empty() returns 0. */
+static bool ubifs_crypt_empty_dir(struct inode *inode)
+{
+ return ubifs_check_dir_empty(inode) == 0; /* any non-zero result counts as "not empty" */
+}
+/* fscrypt ->max_namelen hook: UBIFS_MAX_INO_DATA for symlink inodes, UBIFS_MAX_NLEN otherwise. */
+static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
+{
+ if (S_ISLNK(inode->i_mode))
+ return UBIFS_MAX_INO_DATA;
+ else
+ return UBIFS_MAX_NLEN;
+}
+/* fscrypt ->key_prefix hook: point *@key at the static string "ubifs:" and return its length. */
+static int ubifs_key_prefix(struct inode *inode, u8 **key)
+{
+ static char prefix[] = "ubifs:";
+
+ *key = prefix; /* @inode is ignored: the prefix is the same for every inode */
+
+ return sizeof(prefix) - 1; /* length without the terminating NUL */
+}
+/* Zero-pad @dn's payload to the cipher block size, encrypt it, and report the padded size in *@out_len. */
+int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len, int block)
+{
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ void *p = &dn->data;
+ struct page *ret;
+ unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE);
+
+ ubifs_assert(pad_len <= *out_len); /* on entry *out_len bounds the padded size */
+ dn->compr_size = cpu_to_le16(in_len); /* unpadded length; ubifs_decrypt() hands it back as *out_len */
+
+ /* pad to full block cipher length */
+ if (pad_len != in_len)
+ memset(p + in_len, 0, pad_len - in_len);
+
+ ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len,
+ offset_in_page(&dn->data), block, GFP_NOFS);
+ if (IS_ERR(ret)) { /* the returned page is only inspected for an error code */
+ ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret));
+ return PTR_ERR(ret);
+ }
+ *out_len = pad_len; /* on success the caller sees the padded length */
+
+ return 0;
+}
+/* Decrypt *@out_len bytes of @dn's payload, then set *@out_len to the length stored in dn->compr_size. */
+int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int *out_len, int block)
+{
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ int err;
+ unsigned int clen = le16_to_cpu(dn->compr_size); /* length recorded by ubifs_encrypt() */
+ unsigned int dlen = *out_len; /* number of bytes handed to fscrypt_decrypt_page() */
+
+ if (clen <= 0 || clen > UBIFS_BLOCK_SIZE || clen > dlen) { /* clen is unsigned, so "<= 0" means "== 0" */
+ ubifs_err(c, "bad compr_size: %i", clen); /* NOTE(review): %i is given an unsigned int */
+ return -EINVAL;
+ }
+
+ ubifs_assert(dlen <= UBIFS_BLOCK_SIZE);
+ err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen,
+ offset_in_page(&dn->data), block);
+ if (err) {
+ ubifs_err(c, "fscrypt_decrypt_page failed: %i", err);
+ return err;
+ }
+ *out_len = clen; /* report the stored (unpadded) length, not dlen */
+
+ return 0;
+}
+/* fscrypt operations table for UBIFS; every hook except .is_encrypted is a static helper in this file. */
+struct fscrypt_operations ubifs_crypt_operations = {
+ .flags = FS_CFLG_OWN_PAGES,
+ .get_context = ubifs_crypt_get_context,
+ .set_context = ubifs_crypt_set_context,
+ .is_encrypted = __ubifs_crypt_is_encrypted, /* not defined in this file */
+ .empty_dir = ubifs_crypt_empty_dir,
+ .max_namelen = ubifs_crypt_max_namelen,
+ .key_prefix = ubifs_key_prefix,
+};
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 69e287e20732..1e712a364680 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -233,7 +233,7 @@ static void dump_ch(const struct ubifs_ch *ch)
void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
{
const struct ubifs_inode *ui = ubifs_inode(inode);
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
union ubifs_key key;
struct ubifs_dent_node *dent, *pdent = NULL;
int count = 2;
@@ -289,8 +289,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\t%d: %s (%s)\n",
count++, dent->name, get_dent_type(dent->type));
- nm.name = dent->name;
- nm.len = le16_to_cpu(dent->nlen);
+ fname_name(&nm) = dent->name;
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
kfree(pdent);
pdent = dent;
key_read(c, &dent->key, &key);
@@ -1107,7 +1107,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
unsigned int nlink = 2;
union ubifs_key key;
struct ubifs_dent_node *dent, *pdent = NULL;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
loff_t size = UBIFS_INO_NODE_SZ;
if (!dbg_is_chk_gen(c))
@@ -1128,9 +1128,9 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
return err;
}
- nm.name = dent->name;
- nm.len = le16_to_cpu(dent->nlen);
- size += CALC_DENT_SIZE(nm.len);
+ fname_name(&nm) = dent->name;
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
+ size += CALC_DENT_SIZE(fname_len(&nm));
if (dent->type == UBIFS_ITYPE_DIR)
nlink += 1;
kfree(pdent);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ca16c5d7bab1..528369f3e472 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -85,11 +85,26 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
* initializes it. Returns new inode in case of success and an error code in
* case of failure.
*/
-struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
umode_t mode)
{
+ int err;
struct inode *inode;
struct ubifs_inode *ui;
+ bool encrypted = false;
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err) {
+ ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
+ return ERR_PTR(err);
+ }
+
+ if (!fscrypt_has_encryption_key(dir))
+ return ERR_PTR(-EPERM);
+
+ encrypted = true;
+ }
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
@@ -165,18 +180,29 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
*/
ui->creat_sqnum = ++c->max_sqnum;
spin_unlock(&c->cnt_lock);
+
+ if (encrypted) {
+ err = fscrypt_inherit_context(dir, inode, &encrypted, true);
+ if (err) {
+ ubifs_err(c, "fscrypt_inherit_context failed: %i", err);
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ }
+
return inode;
}
static int dbg_check_name(const struct ubifs_info *c,
const struct ubifs_dent_node *dent,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
if (!dbg_is_chk_gen(c))
return 0; /* dbg_is_chk_gen() is off: nothing is verified */
- if (le16_to_cpu(dent->nlen) != nm->len)
+ if (le16_to_cpu(dent->nlen) != fname_len(nm))
return -EINVAL; /* name length in @dent differs from @nm */
- if (memcmp(dent->name, nm->name, nm->len))
+ if (memcmp(dent->name, fname_name(nm), fname_len(nm)))
return -EINVAL; /* same length, but the name bytes differ */
return 0; /* @dent carries exactly the name described by @nm */
}
@@ -189,30 +215,61 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct ubifs_dent_node *dent;
struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm;
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
- if (dentry->d_name.len > UBIFS_MAX_NLEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+
+ /*
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
+ * created while the directory was encrypted and we
+ * have access to the key.
+ */
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (err && err != -ENOKEY)
+ return ERR_PTR(err);
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+ if (err)
+ return ERR_PTR(err);
+
+ if (fname_len(&nm) > UBIFS_MAX_NLEN) {
+ err = -ENAMETOOLONG;
+ goto out_fname;
+ }
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
- if (!dent)
- return ERR_PTR(-ENOMEM);
+ if (!dent) {
+ err = -ENOMEM;
+ goto out_fname;
+ }
- dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+ if (nm.hash) {
+ ubifs_assert(fname_len(&nm) == 0);
+ ubifs_assert(fname_name(&nm) == NULL);
+ dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
+ err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
+ } else {
+ dent_key_init(c, &key, dir->i_ino, &nm);
+ err = ubifs_tnc_lookup_nm(c, &key, dent, &nm);
+ }
- err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
if (err) {
if (err == -ENOENT) {
dbg_gen("not found");
goto done;
}
- goto out;
+ goto out_dent;
}
- if (dbg_check_name(c, dent, &dentry->d_name)) {
+ if (dbg_check_name(c, dent, &nm)) {
err = -EINVAL;
- goto out;
+ goto out_dent;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
@@ -225,11 +282,12 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
- goto out;
+ goto out_dent;
}
done:
kfree(dent);
+ fscrypt_free_filename(&nm);
/*
* Note, d_splice_alias() would be required instead if we supported
* NFS.
@@ -237,8 +295,10 @@ done:
d_add(dentry, inode);
return NULL;
-out:
+out_dent:
kfree(dent);
+out_fname:
+ fscrypt_free_filename(&nm);
return ERR_PTR(err);
}
@@ -247,10 +307,11 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct fscrypt_name nm;
+ int err, sz_change;
/*
* Budget request settings: new inode, new direntry, changing the
@@ -264,10 +325,16 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (err)
return err;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, mode);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
@@ -278,12 +345,13 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
return 0;
@@ -295,6 +363,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
ubifs_err(c, "cannot create regular file, error %d", err);
@@ -310,6 +380,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
int err, instantiated = 0;
+ struct fscrypt_name nm;
/*
* Budget request settings: new dirty inode, new direntry,
@@ -319,13 +390,20 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
- err = ubifs_budget_space(c, &req);
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ fscrypt_free_filename(&nm);
+ return err;
+ }
+
err = ubifs_budget_space(c, &ino_req);
if (err) {
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
return err;
}
@@ -361,7 +439,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&dir_ui->ui_mutex);
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -380,6 +458,7 @@ out_budg:
ubifs_release_budget(c, &req);
if (!instantiated)
ubifs_release_budget(c, &ino_req);
+ fscrypt_free_filename(&nm);
ubifs_err(c, "cannot create temporary file, error %d", err);
return err;
}
@@ -439,12 +518,14 @@ static unsigned int vfs_dent_type(uint8_t type)
*/
static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
- int err = 0;
- struct qstr nm;
+ int fstr_real_len = 0, err = 0;
+ struct fscrypt_name nm;
+ struct fscrypt_str fstr = {0};
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
+ bool encrypted = ubifs_crypt_is_encrypted(dir);
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@@ -455,6 +536,18 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
*/
return 0;
+ if (encrypted) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+
+ err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
+ if (err)
+ return err;
+
+ fstr_real_len = fstr.len;
+ }
+
if (file->f_version == 0) {
/*
* The file was seek'ed, which means that @file->private_data
@@ -476,12 +569,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
/* File positions 0 and 1 correspond to "." and ".." */
if (ctx->pos < 2) {
ubifs_assert(!file->private_data);
- if (!dir_emit_dots(file, ctx))
+ if (!dir_emit_dots(file, ctx)) {
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
return 0;
+ }
/* Find the first entry in TNC and save it */
lowest_dent_key(c, &key, dir->i_ino);
- nm.name = NULL;
+ fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -499,7 +595,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
* Find the entry corresponding to @ctx->pos or the closest one.
*/
dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
- nm.name = NULL;
+ fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -516,15 +612,33 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
ubifs_inode(dir)->creat_sqnum);
- nm.len = le16_to_cpu(dent->nlen);
- if (!dir_emit(ctx, dent->name, nm.len,
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
+ fname_name(&nm) = dent->name;
+
+ if (encrypted) {
+ fstr.len = fstr_real_len;
+
+ err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c,
+ &dent->key),
+ le32_to_cpu(dent->cookie),
+ &nm.disk_name, &fstr);
+ if (err)
+ goto out;
+ } else {
+ fstr.len = fname_len(&nm);
+ fstr.name = fname_name(&nm);
+ }
+
+ if (!dir_emit(ctx, fstr.name, fstr.len,
le64_to_cpu(dent->inum),
- vfs_dent_type(dent->type)))
+ vfs_dent_type(dent->type))) {
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
return 0;
+ }
/* Switch to the next entry */
key_read(c, &dent->key, &key);
- nm.name = dent->name;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -541,6 +655,9 @@ out:
kfree(file->private_data);
file->private_data = NULL;
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
+
if (err != -ENOENT)
ubifs_err(c, "cannot find next direntry, error %d", err);
else
@@ -601,6 +718,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ struct fscrypt_name nm;
/*
* Budget request settings: new direntry, changing the target inode,
@@ -613,13 +731,21 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
- err = dbg_check_synced_i_size(c, inode);
+ if (ubifs_crypt_is_encrypted(dir) &&
+ !fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
+ err = dbg_check_synced_i_size(c, inode);
+ if (err)
+ goto out_fname;
+
err = ubifs_budget_space(c, &req);
if (err)
- return err;
+ goto out_fname;
lock_2_inodes(dir, inode);
inc_nlink(inode);
@@ -628,13 +754,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -644,6 +771,8 @@ out_cancel:
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
@@ -652,10 +781,10 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
- int err, budgeted = 1;
+ int err, sz_change, budgeted = 1;
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
unsigned int saved_nlink = inode->i_nlink;
+ struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode (+1 for
@@ -667,16 +796,29 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+ if (err)
+ return err;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
- return err;
+ goto out_fname;
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
- return err;
+ goto out_fname;
budgeted = 0;
}
@@ -686,7 +828,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
@@ -698,6 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -707,21 +850,23 @@ out_cancel:
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
/**
* ubifs_check_dir_empty - check if a directory is empty or not.
- * @c: UBIFS file-system description object
* @dir: VFS inode object of the directory to check
*
* This function checks if directory @dir is empty. Returns zero if the
* directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
* in case of errors.
*/
-static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+int ubifs_check_dir_empty(struct inode *dir)
{
- struct qstr nm = { .name = NULL };
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm = { 0 };
struct ubifs_dent_node *dent;
union ubifs_key key;
int err;
@@ -743,10 +888,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
- int err, budgeted = 1;
+ int err, sz_change, budgeted = 1;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode and
@@ -758,14 +903,26 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino, dir->i_ino);
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
- err = check_dir_empty(c, d_inode(dentry));
+ err = ubifs_check_dir_empty(d_inode(dentry));
+ if (err)
+ return err;
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
if (err)
return err;
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
- return err;
+ goto out_fname;
budgeted = 0;
}
@@ -776,7 +933,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
@@ -788,6 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -798,6 +956,8 @@ out_cancel:
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
@@ -806,8 +966,9 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, sz_change;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -821,10 +982,16 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
return err;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
@@ -838,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
goto out_cancel;
@@ -847,6 +1014,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -857,6 +1025,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
@@ -870,11 +1040,12 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
union ubifs_dev_desc *dev = NULL;
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int sz_change;
int err, devlen = 0;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.new_ino_d = ALIGN(devlen, 8),
.dirtied_ino = 1 };
+ struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -896,11 +1067,17 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
return err;
}
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, mode);
if (IS_ERR(inode)) {
kfree(dev);
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
init_special_inode(inode, inode->i_mode, rdev);
@@ -917,7 +1094,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -925,6 +1102,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
ubifs_release_budget(c, &req);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -934,6 +1112,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
@@ -947,10 +1127,27 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, len = strlen(symname);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int sz_change = CALC_DENT_SIZE(len);
+ struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+ struct fscrypt_symlink_data *sd = NULL;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.new_ino_d = ALIGN(len, 8),
.dirtied_ino = 1 };
+ struct fscrypt_name nm;
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ goto out_budg;
+
+ if (!fscrypt_has_encryption_key(dir)) {
+ err = -EPERM;
+ goto out_budg;
+ }
+
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
+ }
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -960,36 +1157,65 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry,
symname, dir->i_ino);
- if (len > UBIFS_MAX_INO_DATA)
+ if (disk_link.len > UBIFS_MAX_INO_DATA)
return -ENAMETOOLONG;
err = ubifs_budget_space(c, &req);
if (err)
return err;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
ui = ubifs_inode(inode);
- ui->data = kmalloc(len + 1, GFP_NOFS);
+ ui->data = kmalloc(disk_link.len, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_inode;
}
- memcpy(ui->data, symname, len);
- ((char *)ui->data)[len] = '\0';
- inode->i_link = ui->data;
+ if (ubifs_crypt_is_encrypted(dir)) {
+ struct qstr istr = QSTR_INIT(symname, len);
+ struct fscrypt_str ostr;
+
+ sd = kzalloc(disk_link.len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
+ goto out_inode;
+ }
+
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err) {
+ kfree(sd);
+ goto out_inode;
+ }
+
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *)sd;
+ } else {
+ inode->i_link = ui->data;
+ }
+
+ memcpy(ui->data, disk_link.name, disk_link.len);
+ ((char *)ui->data)[disk_link.len - 1] = '\0';
+
/*
* The terminating zero byte is not written to the flash media and it
* is put just to make later in-memory string processing simpler. Thus,
* data length is @len, not @len + %1.
*/
- ui->data_len = len;
- inode->i_size = ubifs_inode(inode)->ui_size = len;
+ ui->data_len = disk_link.len - 1;
+ inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1;
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
@@ -999,7 +1225,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -1007,6 +1233,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
ubifs_release_budget(c, &req);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -1016,6 +1243,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
@@ -1078,15 +1307,14 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_inode *whiteout_ui = NULL;
int err, release, sync = 0, move = (new_dir != old_dir);
int is_dir = S_ISDIR(old_inode->i_mode);
- int unlink = !!new_inode;
- int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
- int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
+ int unlink = !!new_inode, new_sz, old_sz;
struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
.dirtied_ino = 3 };
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
unsigned int uninitialized_var(saved_nlink);
+ struct fscrypt_name old_nm, new_nm;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1107,17 +1335,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink)
ubifs_assert(inode_is_locked(new_inode));
+ if (old_dir != new_dir) {
+ if (ubifs_crypt_is_encrypted(new_dir) &&
+ !fscrypt_has_permitted_context(new_dir, old_inode))
+ return -EPERM;
+ }
+
if (unlink && is_dir) {
- err = check_dir_empty(c, new_inode);
+ err = ubifs_check_dir_empty(new_inode);
if (err)
return err;
}
- err = ubifs_budget_space(c, &req);
+ err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm);
if (err)
return err;
+
+ err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm);
+ if (err) {
+ fscrypt_free_filename(&old_nm);
+ return err;
+ }
+
+ new_sz = CALC_DENT_SIZE(fname_len(&new_nm));
+ old_sz = CALC_DENT_SIZE(fname_len(&old_nm));
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
+ return err;
+ }
err = ubifs_budget_space(c, &ino_req);
if (err) {
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
ubifs_release_budget(c, &req);
return err;
}
@@ -1239,8 +1491,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
iput(whiteout);
}
- err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout,
- sync);
+ err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
+ new_inode, &new_nm, whiteout, sync);
if (err)
goto out_cancel;
@@ -1256,6 +1508,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
return err;
out_cancel:
@@ -1284,6 +1539,8 @@ out_cancel:
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
return err;
}
@@ -1298,9 +1555,27 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *snd_inode = d_inode(new_dentry);
struct timespec time;
int err;
+ struct fscrypt_name fst_nm, snd_nm;
ubifs_assert(fst_inode && snd_inode);
+ if ((ubifs_crypt_is_encrypted(old_dir) ||
+ ubifs_crypt_is_encrypted(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!fscrypt_has_permitted_context(new_dir, fst_inode) ||
+ !fscrypt_has_permitted_context(old_dir, snd_inode)))
+ return -EPERM;
+
+ err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
+ if (err)
+ return err;
+
+ err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm);
+ if (err) {
+ fscrypt_free_filename(&fst_nm);
+ return err;
+ }
+
lock_4_inodes(old_dir, new_dir, NULL, NULL);
time = ubifs_current_time(old_dir);
@@ -1320,12 +1595,14 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry,
- sync);
+ err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir,
+ snd_inode, &snd_nm, sync);
unlock_4_inodes(old_dir, new_dir, NULL, NULL);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&fst_nm);
+ fscrypt_free_filename(&snd_nm);
return err;
}
@@ -1384,6 +1661,14 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+static int ubifs_dir_open(struct inode *dir, struct file *file)
+{
+ if (ubifs_crypt_is_encrypted(dir))
+ return fscrypt_get_encryption_info(dir) ? -EACCES : 0;
+
+ return 0;
+}
+
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
@@ -1410,6 +1695,7 @@ const struct file_operations ubifs_dir_operations = {
.iterate_shared = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
+ .open = ubifs_dir_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b4fbeefba246..b0d783774c96 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -78,6 +78,13 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
goto dump;
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, block);
+ if (err)
+ goto dump;
+ }
+
out_len = UBIFS_BLOCK_SIZE;
err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
@@ -650,6 +657,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
out_len = UBIFS_BLOCK_SIZE;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, page_block);
+ if (err)
+ goto out_err;
+ }
+
err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
if (err || len != out_len)
@@ -1594,6 +1608,15 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int err;
+ struct inode *inode = file->f_mapping->host;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
err = generic_file_mmap(file, vma);
if (err)
@@ -1605,6 +1628,88 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int ubifs_file_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+ struct dentry *dir;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ ret = fscrypt_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+
+ dir = dget_parent(file_dentry(filp));
+ if (ubifs_crypt_is_encrypted(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu",
+ (unsigned long) d_inode(dir)->i_ino,
+ (unsigned long) inode->i_ino);
+ dput(dir);
+ ubifs_ro_mode(c, -EPERM);
+ return -EPERM;
+ }
+ dput(dir);
+
+ return 0;
+}
+
+static const char *ubifs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ int err;
+ struct fscrypt_symlink_data *sd;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct fscrypt_str cstr;
+ struct fscrypt_str pstr;
+
+ if (!ubifs_crypt_is_encrypted(inode))
+ return ui->data;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return ERR_PTR(err);
+
+ sd = (struct fscrypt_symlink_data *)ui->data;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le16_to_cpu(sd->len);
+
+ if (cstr.len == 0)
+ return ERR_PTR(-ENOENT);
+
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len)
+ return ERR_PTR(-EIO);
+
+ err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ if (err)
+ return ERR_PTR(err);
+
+ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
+ if (err) {
+ fscrypt_fname_free_buffer(&pstr);
+ return ERR_PTR(err);
+ }
+
+ pstr.name[pstr.len] = '\0';
+
+ // XXX this probably won't happen anymore...
+ if (pstr.name[0] == '\0') {
+ fscrypt_fname_free_buffer(&pstr);
+ return ERR_PTR(-ENOENT);
+ }
+
+ set_delayed_call(done, kfree_link, pstr.name);
+ return pstr.name;
+}
+
+
const struct address_space_operations ubifs_file_address_operations = {
.readpage = ubifs_readpage,
.writepage = ubifs_writepage,
@@ -1628,8 +1733,7 @@ const struct inode_operations ubifs_file_inode_operations = {
};
const struct inode_operations ubifs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .get_link = simple_get_link,
+ .get_link = ubifs_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.listxattr = ubifs_listxattr,
@@ -1647,6 +1751,7 @@ const struct file_operations ubifs_file_operations = {
.unlocked_ioctl = ubifs_ioctl,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .open = ubifs_file_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e845c64b6ce1..7b35e3d6cde7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -846,10 +846,6 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
*/
while (1) {
lp = ubifs_fast_find_freeable(c);
- if (IS_ERR(lp)) {
- err = PTR_ERR(lp);
- goto out;
- }
if (!lp)
break;
ubifs_assert(!(lp->flags & LPROPS_TAKEN));
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 97be41215332..3be28900bf37 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -452,16 +452,22 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
*/
static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
{
+ ktime_t softlimit = ms_to_ktime(dirty_writeback_interval * 10);
+ unsigned long long delta = dirty_writeback_interval;
+
+ /* centi to milli, milli to nano, then 10% */
+ delta *= 10ULL * NSEC_PER_MSEC / 10ULL;
+
ubifs_assert(!hrtimer_active(&wbuf->timer));
+ ubifs_assert(delta <= ULONG_MAX);
if (wbuf->no_timer)
return;
dbg_io("set timer for jhead %s, %llu-%llu millisecs",
dbg_jhead(wbuf->jhead),
- div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
- div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
- USEC_PER_SEC));
- hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+ div_u64(ktime_to_ns(softlimit), USEC_PER_SEC),
+ div_u64(ktime_to_ns(softlimit) + delta, USEC_PER_SEC));
+ hrtimer_start_range_ns(&wbuf->timer, softlimit, delta,
HRTIMER_MODE_REL);
}
@@ -1059,10 +1065,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
wbuf->timer.function = wbuf_timer_callback_nolock;
- wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
- wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
- wbuf->delta *= 1000000000ULL;
- ubifs_assert(wbuf->delta <= ULONG_MAX);
return 0;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 3c7b29de0ca7..da519ba205f6 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -181,6 +181,26 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
mnt_drop_write_file(file);
return err;
}
+ case FS_IOC_SET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ err = ubifs_enable_encryption(c);
+ if (err)
+ return err;
+
+ return fscrypt_ioctl_set_policy(file, (const void __user *)arg);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+ case FS_IOC_GET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
default:
return -ENOTTY;
@@ -197,6 +217,9 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 91bc76dc559e..294519b98874 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -78,16 +78,6 @@ static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
{
dent->padding1 = 0;
- memset(dent->padding2, 0, 4);
-}
-
-/**
- * zero_data_node_unused - zero out unused fields of an on-flash data node.
- * @data: the data node to zero out
- */
-static inline void zero_data_node_unused(struct ubifs_data_node *data)
-{
- memset(data->padding, 0, 2);
}
/**
@@ -511,6 +501,14 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
ui->dirty = 0;
}
+static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
+{
+ if (c->double_hash)
+ dent->cookie = prandom_u32();
+ else
+ dent->cookie = 0;
+}
+
/**
* ubifs_jnl_update - update inode.
* @c: UBIFS file-system description object
@@ -539,7 +537,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
* success. In case of failure, a negative error code is returned.
*/
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
- const struct qstr *nm, const struct inode *inode,
+ const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent)
{
int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
@@ -551,11 +549,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
struct ubifs_ino_node *ino;
union ubifs_key dent_key, ino_key;
- dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
- inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
+ //dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
+ // inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
- dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ dlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1;
ilen = UBIFS_INO_NODE_SZ;
/*
@@ -596,9 +594,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
key_write(c, &dent_key, dent->key);
dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
dent->type = get_dent_type(inode->i_mode);
- dent->nlen = cpu_to_le16(nm->len);
- memcpy(dent->name, nm->name, nm->len);
- dent->name[nm->len] = '\0';
+ dent->nlen = cpu_to_le16(fname_len(nm));
+ memcpy(dent->name, fname_name(nm), fname_len(nm));
+ dent->name[fname_len(nm)] = '\0';
+ set_dent_cookie(c, dent);
+
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen, 0);
@@ -697,14 +697,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len)
{
struct ubifs_data_node *data;
- int err, lnum, offs, compr_type, out_len;
+ int err, lnum, offs, compr_type, out_len, compr_len;
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
+ bool encrypted = ubifs_crypt_is_encrypted(inode);
dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
(unsigned long)key_inum(c, key), key_block(c, key), len);
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
+ if (encrypted)
+ dlen += UBIFS_CIPHER_BLOCK_SIZE;
+
data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
if (!data) {
/*
@@ -722,7 +726,6 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
data->ch.node_type = UBIFS_DATA_NODE;
key_write(c, key, &data->key);
data->size = cpu_to_le32(len);
- zero_data_node_unused(data);
if (!(ui->flags & UBIFS_COMPR_FL))
/* Compression is disabled for this inode */
@@ -730,9 +733,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
else
compr_type = ui->compr_type;
- out_len = dlen - UBIFS_DATA_NODE_SZ;
- ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type);
- ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+ out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ;
+ ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type);
+ ubifs_assert(compr_len <= UBIFS_BLOCK_SIZE);
+
+ if (encrypted) {
+ err = ubifs_encrypt(inode, data, compr_len, &out_len, key_block(c, key));
+ if (err)
+ goto out_free;
+
+ } else {
+ data->compr_size = 0;
+ out_len = compr_len;
+ }
dlen = UBIFS_DATA_NODE_SZ + out_len;
data->compr_type = cpu_to_le16(compr_type);
@@ -911,9 +924,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
* ubifs_jnl_xrename - cross rename two directory entries.
* @c: UBIFS file-system description object
* @fst_dir: parent inode of 1st directory entry to exchange
- * @fst_dentry: 1st directory entry to exchange
+ * @fst_inode: 1st inode to exchange
+ * @fst_nm: name of 1st inode to exchange
* @snd_dir: parent inode of 2nd directory entry to exchange
- * @snd_dentry: 2nd directory entry to exchange
+ * @snd_inode: 2nd inode to exchange
+ * @snd_nm: name of 2nd inode to exchange
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the cross rename operation which may involve
@@ -922,29 +937,29 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
* returned.
*/
int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
- const struct dentry *fst_dentry,
+ const struct inode *fst_inode,
+ const struct fscrypt_name *fst_nm,
const struct inode *snd_dir,
- const struct dentry *snd_dentry, int sync)
+ const struct inode *snd_inode,
+ const struct fscrypt_name *snd_nm, int sync)
{
union ubifs_key key;
struct ubifs_dent_node *dent1, *dent2;
int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ;
int aligned_dlen1, aligned_dlen2;
int twoparents = (fst_dir != snd_dir);
- const struct inode *fst_inode = d_inode(fst_dentry);
- const struct inode *snd_inode = d_inode(snd_dentry);
void *p;
- dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu",
- fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino);
+ //dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu",
+ // fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino);
ubifs_assert(ubifs_inode(fst_dir)->data_len == 0);
ubifs_assert(ubifs_inode(snd_dir)->data_len == 0);
ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex));
ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex));
- dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1;
- dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1;
+ dlen1 = UBIFS_DENT_NODE_SZ + fname_len(snd_nm) + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + fname_len(fst_nm) + 1;
aligned_dlen1 = ALIGN(dlen1, 8);
aligned_dlen2 = ALIGN(dlen2, 8);
@@ -963,24 +978,24 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
/* Make new dent for 1st entry */
dent1->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name);
+ dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, snd_nm);
dent1->inum = cpu_to_le64(fst_inode->i_ino);
dent1->type = get_dent_type(fst_inode->i_mode);
- dent1->nlen = cpu_to_le16(snd_dentry->d_name.len);
- memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len);
- dent1->name[snd_dentry->d_name.len] = '\0';
+ dent1->nlen = cpu_to_le16(fname_len(snd_nm));
+ memcpy(dent1->name, fname_name(snd_nm), fname_len(snd_nm));
+ dent1->name[fname_len(snd_nm)] = '\0';
zero_dent_node_unused(dent1);
ubifs_prep_grp_node(c, dent1, dlen1, 0);
/* Make new dent for 2nd entry */
dent2 = (void *)dent1 + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name);
+ dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, fst_nm);
dent2->inum = cpu_to_le64(snd_inode->i_ino);
dent2->type = get_dent_type(snd_inode->i_mode);
- dent2->nlen = cpu_to_le16(fst_dentry->d_name.len);
- memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len);
- dent2->name[fst_dentry->d_name.len] = '\0';
+ dent2->nlen = cpu_to_le16(fname_len(fst_nm));
+ memcpy(dent2->name, fname_name(fst_nm), fname_len(fst_nm));
+ dent2->name[fname_len(fst_nm)] = '\0';
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
@@ -1004,14 +1019,14 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
}
release_head(c, BASEHD);
- dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name);
+ dent_key_init(c, &key, snd_dir->i_ino, snd_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
- dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name);
+ dent_key_init(c, &key, fst_dir->i_ino, fst_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm);
if (err)
goto out_ro;
@@ -1063,31 +1078,31 @@ out_free:
* returned.
*/
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
- const struct dentry *old_dentry,
+ const struct inode *old_inode,
+ const struct fscrypt_name *old_nm,
const struct inode *new_dir,
- const struct dentry *new_dentry,
+ const struct inode *new_inode,
+ const struct fscrypt_name *new_nm,
const struct inode *whiteout, int sync)
{
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
int err, dlen1, dlen2, ilen, lnum, offs, len;
- const struct inode *old_inode = d_inode(old_dentry);
- const struct inode *new_inode = d_inode(new_dentry);
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
struct ubifs_inode *uninitialized_var(new_ui);
- dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
- old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
+ //dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
+ // old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
- dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
- dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
+ dlen1 = UBIFS_DENT_NODE_SZ + fname_len(new_nm) + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + fname_len(old_nm) + 1;
if (new_inode) {
new_ui = ubifs_inode(new_inode);
ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
@@ -1113,19 +1128,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
/* Make new dent */
dent->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
+ dent_key_init_flash(c, &dent->key, new_dir->i_ino, new_nm);
dent->inum = cpu_to_le64(old_inode->i_ino);
dent->type = get_dent_type(old_inode->i_mode);
- dent->nlen = cpu_to_le16(new_dentry->d_name.len);
- memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
- dent->name[new_dentry->d_name.len] = '\0';
+ dent->nlen = cpu_to_le16(fname_len(new_nm));
+ memcpy(dent->name, fname_name(new_nm), fname_len(new_nm));
+ dent->name[fname_len(new_nm)] = '\0';
+ set_dent_cookie(c, dent);
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen1, 0);
dent2 = (void *)dent + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
- &old_dentry->d_name);
+ dent_key_init_flash(c, &dent2->key, old_dir->i_ino, old_nm);
if (whiteout) {
dent2->inum = cpu_to_le64(whiteout->i_ino);
@@ -1135,9 +1150,10 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
dent2->inum = 0;
dent2->type = DT_UNKNOWN;
}
- dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
- memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
- dent2->name[old_dentry->d_name.len] = '\0';
+ dent2->nlen = cpu_to_le16(fname_len(old_nm));
+ memcpy(dent2->name, fname_name(old_nm), fname_len(old_nm));
+ dent2->name[fname_len(old_nm)] = '\0';
+ set_dent_cookie(c, dent2);
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
@@ -1178,15 +1194,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
}
release_head(c, BASEHD);
- dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
+ dent_key_init(c, &key, new_dir->i_ino, new_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
if (whiteout) {
- dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name);
+ dent_key_init(c, &key, old_dir->i_ino, old_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm);
if (err)
goto out_ro;
@@ -1196,8 +1212,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (err)
goto out_ro;
- dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
- err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+ dent_key_init(c, &key, old_dir->i_ino, old_nm);
+ err = ubifs_tnc_remove_nm(c, &key, old_nm);
if (err)
goto out_ro;
}
@@ -1251,35 +1267,60 @@ out_free:
}
/**
- * recomp_data_node - re-compress a truncated data node.
+ * truncate_data_node - re-compress/encrypt a truncated data node.
+ * @c: UBIFS file-system description object
+ * @inode: inode which referes to the data node
+ * @block: data block number
* @dn: data node to re-compress
* @new_len: new length
*
* This function is used when an inode is truncated and the last data node of
- * the inode has to be re-compressed and re-written.
+ * the inode has to be re-compressed/encrypted and re-written.
*/
-static int recomp_data_node(const struct ubifs_info *c,
- struct ubifs_data_node *dn, int *new_len)
+static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode,
+ unsigned int block, struct ubifs_data_node *dn,
+ int *new_len)
{
void *buf;
- int err, len, compr_type, out_len;
+ int err, dlen, compr_type, out_len, old_dlen;
out_len = le32_to_cpu(dn->size);
buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
if (!buf)
return -ENOMEM;
- len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
- err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type);
- if (err)
- goto out;
- ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, block);
+ if (err)
+ goto out;
+ }
+
+ if (compr_type != UBIFS_COMPR_NONE) {
+ err = ubifs_decompress(c, &dn->data, dlen, buf, &out_len, compr_type);
+ if (err)
+ goto out;
+
+ ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
+ }
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
+ if (err)
+ goto out;
+
+ out_len = old_dlen;
+ } else {
+ dn->compr_size = 0;
+ }
+
ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
dn->compr_type = cpu_to_le16(compr_type);
dn->size = cpu_to_le32(*new_len);
*new_len = UBIFS_DATA_NODE_SZ + out_len;
+ err = 0;
out:
kfree(buf);
return err;
@@ -1347,17 +1388,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
if (le32_to_cpu(dn->size) <= dlen)
dlen = 0; /* Nothing to do */
else {
- int compr_type = le16_to_cpu(dn->compr_type);
-
- if (compr_type != UBIFS_COMPR_NONE) {
- err = recomp_data_node(c, dn, &dlen);
- if (err)
- goto out_free;
- } else {
- dn->size = cpu_to_le32(dlen);
- dlen += UBIFS_DATA_NODE_SZ;
- }
- zero_data_node_unused(dn);
+ err = truncate_data_node(c, inode, blk, dn, &dlen);
+ if (err)
+ goto out_free;
}
}
}
@@ -1442,7 +1475,8 @@ out_free:
* error code in case of failure.
*/
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
- const struct inode *inode, const struct qstr *nm)
+ const struct inode *inode,
+ const struct fscrypt_name *nm)
{
int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
struct ubifs_dent_node *xent;
@@ -1451,9 +1485,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
int sync = IS_DIRSYNC(host);
struct ubifs_inode *host_ui = ubifs_inode(host);
- dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
- host->i_ino, inode->i_ino, nm->name,
- ubifs_inode(inode)->data_len);
+ //dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
+ // host->i_ino, inode->i_ino, nm->name,
+ // ubifs_inode(inode)->data_len);
ubifs_assert(inode->i_nlink == 0);
ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
@@ -1461,7 +1495,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
* Since we are deleting the inode, we do not bother to attach any data
* to it and assume its length is %UBIFS_INO_NODE_SZ.
*/
- xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1;
aligned_xlen = ALIGN(xlen, 8);
hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
@@ -1482,9 +1516,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
key_write(c, &xent_key, xent->key);
xent->inum = 0;
xent->type = get_dent_type(inode->i_mode);
- xent->nlen = cpu_to_le16(nm->len);
- memcpy(xent->name, nm->name, nm->len);
- xent->name[nm->len] = '\0';
+ xent->nlen = cpu_to_le16(fname_len(nm));
+ memcpy(xent->name, fname_name(nm), fname_len(nm));
+ xent->name[fname_len(nm)] = '\0';
zero_dent_node_unused(xent);
ubifs_prep_grp_node(c, xent, xlen, 0);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index c0a95e393347..7547be512db2 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -69,7 +69,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
uint32_t a = 0;
const signed char *str = (const signed char *)s;
- while (*str) {
+ while (len--) {
a += *str << 4;
a += *str >> 4;
a *= 11;
@@ -153,13 +153,13 @@ static inline void highest_ino_key(const struct ubifs_info *c,
* @c: UBIFS file-system description object
* @key: key to initialize
* @inum: parent inode number
- * @nm: direntry name and length
+ * @nm: direntry name and length. Not a string when encrypted!
*/
static inline void dent_key_init(const struct ubifs_info *c,
union ubifs_key *key, ino_t inum,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->u32[0] = inum;
@@ -191,10 +191,11 @@ static inline void dent_key_init_hash(const struct ubifs_info *c,
* @nm: direntry name and length
*/
static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, const struct qstr *nm)
+ ino_t inum,
+ const struct fscrypt_name *nm)
{
union ubifs_key *key = k;
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->j32[0] = cpu_to_le32(inum);
@@ -225,9 +226,9 @@ static inline void lowest_dent_key(const struct ubifs_info *c,
*/
static inline void xent_key_init(const struct ubifs_info *c,
union ubifs_key *key, ino_t inum,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->u32[0] = inum;
@@ -242,10 +243,10 @@ static inline void xent_key_init(const struct ubifs_info *c,
* @nm: extended attribute entry name and length
*/
static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, const struct qstr *nm)
+ ino_t inum, const struct fscrypt_name *nm)
{
union ubifs_key *key = k;
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->j32[0] = cpu_to_le32(inum);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index fb0f44cd1e28..ae5c02f22f3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -61,7 +61,7 @@ struct replay_entry {
struct list_head list;
union ubifs_key key;
union {
- struct qstr nm;
+ struct fscrypt_name nm;
struct {
loff_t old_size;
loff_t new_size;
@@ -327,7 +327,7 @@ static void destroy_replay_list(struct ubifs_info *c)
list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
if (is_hash_key(c, &r->key))
- kfree(r->nm.name);
+ kfree(fname_name(&r->nm));
list_del(&r->list);
kfree(r);
}
@@ -430,10 +430,10 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
r->deletion = !!deletion;
r->sqnum = sqnum;
key_copy(c, key, &r->key);
- r->nm.len = nlen;
+ fname_len(&r->nm) = nlen;
memcpy(nbuf, name, nlen);
nbuf[nlen] = '\0';
- r->nm.name = nbuf;
+ fname_name(&r->nm) = nbuf;
list_add_tail(&r->list, &c->replay_list);
return 0;
@@ -456,7 +456,7 @@ int ubifs_validate_entry(struct ubifs_info *c,
if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
dent->type >= UBIFS_ITYPES_CNT ||
nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
- strnlen(dent->name, nlen) != nlen ||
+ (key_type == UBIFS_XENT_KEY && strnlen(dent->name, nlen) != nlen) ||
le64_to_cpu(dent->inum) > MAX_INUM) {
ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ?
"directory entry" : "extended attribute entry");
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3cbb904a6d7d..7f1ead29e727 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -163,6 +163,7 @@ static int create_default_filesystem(struct ubifs_info *c)
tmp64 = (long long)max_buds * c->leb_size;
if (big_lpt)
sup_flags |= UBIFS_FLG_BIGLPT;
+ sup_flags |= UBIFS_FLG_DOUBLE_HASH;
sup->ch.node_type = UBIFS_SB_NODE;
sup->key_hash = UBIFS_KEY_HASH_R5;
@@ -465,6 +466,16 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
goto failed;
}
+ if (!c->double_hash && c->fmt_version >= 5) {
+ err = 16;
+ goto failed;
+ }
+
+ if (c->encrypted && c->fmt_version < 5) {
+ err = 17;
+ goto failed;
+ }
+
return 0;
failed:
@@ -620,6 +631,24 @@ int ubifs_read_superblock(struct ubifs_info *c)
memcpy(&c->uuid, &sup->uuid, 16);
c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
+ c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH);
+ c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION);
+
+ if ((sup_flags & ~UBIFS_FLG_MASK) != 0) {
+ ubifs_err(c, "Unknown feature flags found: %#x",
+ sup_flags & ~UBIFS_FLG_MASK);
+ err = -EINVAL;
+ goto out;
+ }
+
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+ if (c->encrypted) {
+ ubifs_err(c, "file system contains encrypted files but UBIFS"
+ " was built without crypto support.");
+ err = -EINVAL;
+ goto out;
+ }
+#endif
/* Automatically increase file system size to the maximum size */
c->old_leb_cnt = c->leb_cnt;
@@ -807,3 +836,33 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
ubifs_msg(c, "free space fixup complete");
return err;
}
+
+int ubifs_enable_encryption(struct ubifs_info *c)
+{
+ int err;
+ struct ubifs_sb_node *sup;
+
+ if (c->encrypted)
+ return 0;
+
+ if (c->ro_mount || c->ro_media)
+ return -EROFS;
+
+ if (c->fmt_version < 5) {
+ ubifs_err(c, "on-flash format version 5 is needed for encryption");
+ return -EINVAL;
+ }
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION);
+
+ err = ubifs_write_sb_node(c, sup);
+ if (!err)
+ c->encrypted = 1;
+ kfree(sup);
+
+ return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4ec051089186..e08aa04fc835 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -198,7 +198,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
}
memcpy(ui->data, ino->data, ui->data_len);
((char *)ui->data)[ui->data_len] = '\0';
- inode->i_link = ui->data;
break;
case S_IFBLK:
case S_IFCHR:
@@ -380,6 +379,9 @@ out:
}
done:
clear_inode(inode);
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ fscrypt_put_encryption_info(inode, NULL);
+#endif
}
static void ubifs_dirty_inode(struct inode *inode, int flags)
@@ -1207,7 +1209,8 @@ static int mount_ubifs(struct ubifs_info *c)
bu_init(c);
if (!c->ro_mount) {
- c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \
+ UBIFS_CIPHER_BLOCK_SIZE,
GFP_KERNEL);
if (!c->write_reserve_buf)
goto out_free;
@@ -1620,7 +1623,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
}
- c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \
+ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL);
if (!c->write_reserve_buf) {
err = -ENOMEM;
goto out;
@@ -1995,6 +1999,12 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+struct fscrypt_operations ubifs_crypt_operations = {
+ .is_encrypted = __ubifs_crypt_is_encrypted,
+};
+#endif
+
static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ubifs_info *c = sb->s_fs_info;
@@ -2041,6 +2051,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
sb->s_xattr = ubifs_xattr_handlers;
+ sb->s_cop = &ubifs_crypt_operations;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa9a20cc60d6..709aa098dd46 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -34,6 +34,11 @@
#include <linux/slab.h>
#include "ubifs.h"
+static int try_read_node(const struct ubifs_info *c, void *buf, int type,
+ int len, int lnum, int offs);
+static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_zbranch *zbr, void *node);
+
/*
* Returned codes of 'matches_name()' and 'fallible_matches_name()' functions.
* @NAME_LESS: name corresponding to the first argument is less than second
@@ -378,7 +383,7 @@ static void lnc_free(struct ubifs_zbranch *zbr)
}
/**
- * tnc_read_node_nm - read a "hashed" leaf node.
+ * tnc_read_hashed_node - read a "hashed" leaf node.
* @c: UBIFS file-system description object
* @zbr: key and position of the node
* @node: node is returned here
@@ -388,8 +393,8 @@ static void lnc_free(struct ubifs_zbranch *zbr)
* added to LNC. Returns zero in case of success or a negative error
* code in case of failure.
*/
-static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
- void *node)
+static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
{
int err;
@@ -402,7 +407,19 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
return 0;
}
- err = ubifs_tnc_read_node(c, zbr, node);
+ if (c->replaying) {
+ err = fallible_read_node(c, &zbr->key, zbr, node);
+ /*
+ * When the node was not found, return -ENOENT, 0 otherwise.
+ * Negative return codes stay as-is.
+ */
+ if (err == 0)
+ err = -ENOENT;
+ else if (err == 1)
+ err = 0;
+ } else {
+ err = ubifs_tnc_read_node(c, zbr, node);
+ }
if (err)
return err;
@@ -519,7 +536,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
* of failure, a negative error code is returned.
*/
static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
struct ubifs_dent_node *dent;
int nlen, err;
@@ -542,11 +559,11 @@ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
dent = zbr->leaf;
nlen = le16_to_cpu(dent->nlen);
- err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm)));
if (err == 0) {
- if (nlen == nm->len)
+ if (nlen == fname_len(nm))
return NAME_MATCHES;
- else if (nlen < nm->len)
+ else if (nlen < fname_len(nm))
return NAME_LESS;
else
return NAME_GREATER;
@@ -689,7 +706,7 @@ static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
*/
static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode **zn, int *n,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int err;
@@ -807,7 +824,7 @@ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
*/
static int fallible_matches_name(struct ubifs_info *c,
struct ubifs_zbranch *zbr,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
struct ubifs_dent_node *dent;
int nlen, err;
@@ -835,11 +852,11 @@ static int fallible_matches_name(struct ubifs_info *c,
dent = zbr->leaf;
nlen = le16_to_cpu(dent->nlen);
- err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm)));
if (err == 0) {
- if (nlen == nm->len)
+ if (nlen == fname_len(nm))
return NAME_MATCHES;
- else if (nlen < nm->len)
+ else if (nlen < fname_len(nm))
return NAME_LESS;
else
return NAME_GREATER;
@@ -878,7 +895,8 @@ out_free:
static int fallible_resolve_collision(struct ubifs_info *c,
const union ubifs_key *key,
struct ubifs_znode **zn, int *n,
- const struct qstr *nm, int adding)
+ const struct fscrypt_name *nm,
+ int adding)
{
struct ubifs_znode *o_znode = NULL, *znode = *zn;
int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
@@ -1453,7 +1471,7 @@ again:
* In this case the leaf node cache gets used, so we pass the
* address of the zbranch and keep the mutex locked
*/
- err = tnc_read_node_nm(c, zt, node);
+ err = tnc_read_hashed_node(c, zt, node);
goto out;
}
if (safely) {
@@ -1782,19 +1800,19 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
* @node: the node is returned here
* @nm: node name
*
- * This function look up and reads a node which contains name hash in the key.
+ * This function looks up and reads a node which contains name hash in the key.
* Since the hash may have collisions, there may be many nodes with the same
* key, so we have to sequentially look to all of them until the needed one is
* found. This function returns zero in case of success, %-ENOENT if the node
* was not found, and a negative error code in case of failure.
*/
static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm)
+ void *node, const struct fscrypt_name *nm)
{
int found, n, err;
struct ubifs_znode *znode;
- dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
+ //dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1816,7 +1834,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
goto out_unlock;
}
- err = tnc_read_node_nm(c, &znode->zbranch[n], node);
+ err = tnc_read_hashed_node(c, &znode->zbranch[n], node);
out_unlock:
mutex_unlock(&c->tnc_mutex);
@@ -1830,14 +1848,14 @@ out_unlock:
* @node: the node is returned here
* @nm: node name
*
- * This function look up and reads a node which contains name hash in the key.
+ * This function looks up and reads a node which contains name hash in the key.
* Since the hash may have collisions, there may be many nodes with the same
* key, so we have to sequentially look to all of them until the needed one is
* found. This function returns zero in case of success, %-ENOENT if the node
* was not found, and a negative error code in case of failure.
*/
int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm)
+ void *node, const struct fscrypt_name *nm)
{
int err, len;
const struct ubifs_dent_node *dent = node;
@@ -1851,16 +1869,105 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
return err;
len = le16_to_cpu(dent->nlen);
- if (nm->len == len && !memcmp(dent->name, nm->name, len))
+ if (fname_len(nm) == len && !memcmp(dent->name, fname_name(nm), len))
return 0;
/*
* Unluckily, there are hash collisions and we have to iterate over
* them, looking at each direntry with colliding name hash sequentially.
*/
+
return do_lookup_nm(c, key, node, nm);
}
+static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_dent_node *dent, uint32_t cookie)
+{
+ int n, err, type = key_type(c, key);
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch *zbr;
+ union ubifs_key *dkey, start_key;
+
+ ubifs_assert(is_hash_key(c, key));
+
+ lowest_dent_key(c, &start_key, key_inum(c, key));
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, &start_key, &znode, &n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+
+ for (;;) {
+ if (!err) {
+ err = tnc_next(c, &znode, &n);
+ if (err)
+ goto out_unlock;
+ }
+
+ zbr = &znode->zbranch[n];
+ dkey = &zbr->key;
+
+ if (key_inum(c, dkey) != key_inum(c, key) ||
+ key_type(c, dkey) != type) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ err = tnc_read_hashed_node(c, zbr, dent);
+ if (err)
+ goto out_unlock;
+
+ if (key_hash(c, key) == key_hash(c, dkey) &&
+ le32_to_cpu(dent->cookie) == cookie)
+ goto out_unlock;
+ }
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_lookup_dh - look up a "double hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @cookie: node cookie for collision resolution
+ *
+ * This function looks up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look at all of them until the needed one
+ * with the same cookie value is found.
+ * This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, uint32_t cookie)
+{
+ int err;
+ const struct ubifs_dent_node *dent = node;
+
+ if (!c->double_hash)
+ return -EOPNOTSUPP;
+
+ /*
+ * We assume that in most of the cases there are no name collisions and
+ * 'ubifs_tnc_lookup()' returns us the right direntry.
+ */
+ err = ubifs_tnc_lookup(c, key, node);
+ if (err)
+ return err;
+
+ if (le32_to_cpu(dent->cookie) == cookie)
+ return 0;
+
+ /*
+ * Unluckily, there are hash collisions and we have to iterate over
+ * them, looking at each direntry with colliding name hash sequentially.
+ */
+ return do_lookup_dh(c, key, node, cookie);
+}
+
/**
* correct_parent_keys - correct parent znodes' keys.
* @c: UBIFS file-system description object
@@ -2279,14 +2386,15 @@ out_unlock:
* may have collisions, like directory entry keys.
*/
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len, const struct qstr *nm)
+ int lnum, int offs, int len,
+ const struct fscrypt_name *nm)
{
int found, n, err = 0;
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
- lnum, offs, nm->len, nm->name);
+ //dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+ // lnum, offs, nm->len, nm->name);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2344,7 +2452,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
* by passing 'ubifs_tnc_remove_nm()' the same key but
* an unmatchable name.
*/
- struct qstr noname = { .name = "" };
+ struct fscrypt_name noname = { .disk_name = { .name = "", .len = 1 } };
err = dbg_check_tnc(c, 0);
mutex_unlock(&c->tnc_mutex);
@@ -2514,13 +2622,13 @@ out_unlock:
* Returns %0 on success or negative error code on failure.
*/
int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int n, err;
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
+ //dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
err = lookup_level0_dirty(c, key, &znode, &n);
if (err < 0)
goto out_unlock;
@@ -2669,7 +2777,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
{
union ubifs_key key1, key2;
struct ubifs_dent_node *xent, *pxent = NULL;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
dbg_tnc("ino %lu", (unsigned long)inum);
@@ -2694,8 +2802,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
dbg_tnc("xent '%s', ino %lu", xent->name,
(unsigned long)xattr_inum);
- nm.name = xent->name;
- nm.len = le16_to_cpu(xent->nlen);
+ fname_name(&nm) = xent->name;
+ fname_len(&nm) = le16_to_cpu(xent->nlen);
err = ubifs_tnc_remove_nm(c, &key1, &nm);
if (err) {
kfree(xent);
@@ -2747,7 +2855,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
*/
struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
union ubifs_key *key,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int n, err, type = key_type(c, key);
struct ubifs_znode *znode;
@@ -2755,7 +2863,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
- dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
+ //dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
ubifs_assert(is_hash_key(c, key));
mutex_lock(&c->tnc_mutex);
@@ -2763,10 +2871,14 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
if (unlikely(err < 0))
goto out_unlock;
- if (nm->name) {
+ if (fname_len(nm) > 0) {
if (err) {
/* Handle collisions */
- err = resolve_collision(c, key, &znode, &n, nm);
+ if (c->replaying)
+ err = fallible_resolve_collision(c, key, &znode, &n,
+ nm, 0);
+ else
+ err = resolve_collision(c, key, &znode, &n, nm);
dbg_tnc("rc returned %d, znode %p, n %d",
err, znode, n);
if (unlikely(err < 0))
@@ -2813,7 +2925,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
goto out_free;
}
- err = tnc_read_node_nm(c, zbr, dent);
+ err = tnc_read_hashed_node(c, zbr, dent);
if (unlikely(err))
goto out_free;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index e24380cf46ed..e8c23c9d4f4a 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -46,7 +46,7 @@
* UBIFS went into mainline kernel with format version 4. The older formats
* were development formats.
*/
-#define UBIFS_FORMAT_VERSION 4
+#define UBIFS_FORMAT_VERSION 5
/*
* Read-only compatibility version. If the UBIFS format is changed, older UBIFS
@@ -301,6 +301,13 @@ enum {
#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
/*
+ * xattr name of the UBIFS encryption context. We use neither a prefix
+ * nor a long name, so as not to waste space on the flash.
+ */
+#define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+
+
+/*
* On-flash inode flags.
*
* UBIFS_COMPR_FL: use compression for this inode
@@ -309,6 +316,7 @@ enum {
* UBIFS_APPEND_FL: writes to the inode may only append data
* UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
* UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
+ * UBIFS_CRYPT_FL: use encryption for this inode
*
* Note, these are on-flash flags which correspond to ioctl flags
* (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
@@ -321,6 +329,7 @@ enum {
UBIFS_APPEND_FL = 0x08,
UBIFS_DIRSYNC_FL = 0x10,
UBIFS_XATTR_FL = 0x20,
+ UBIFS_CRYPT_FL = 0x40,
};
/* Inode flag bits used by UBIFS */
@@ -409,12 +418,19 @@ enum {
*
* UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
* UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
+ * UBIFS_FLG_DOUBLE_HASH: store a 32-bit cookie in directory entry nodes to
+ * support 64-bit cookies for lookups by hash
+ * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files
*/
enum {
UBIFS_FLG_BIGLPT = 0x02,
UBIFS_FLG_SPACE_FIXUP = 0x04,
+ UBIFS_FLG_DOUBLE_HASH = 0x08,
+ UBIFS_FLG_ENCRYPTION = 0x10,
};
+#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION)
+
/**
* struct ubifs_ch - common header node.
* @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
@@ -521,7 +537,8 @@ struct ubifs_ino_node {
* @padding1: reserved for future, zeroes
* @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
* @nlen: name length
- * @padding2: reserved for future, zeroes
+ * @cookie: A 32-bit random number, used to construct a 64-bit
+ * identifier.
* @name: zero-terminated name
*
* Note, do not forget to amend 'zero_dent_node_unused()' function when
@@ -534,7 +551,7 @@ struct ubifs_dent_node {
__u8 padding1;
__u8 type;
__le16 nlen;
- __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
+ __le32 cookie;
__u8 name[];
} __packed;
@@ -544,18 +561,16 @@ struct ubifs_dent_node {
* @key: node key
* @size: uncompressed data size in bytes
* @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
- * @padding: reserved for future, zeroes
+ * @compr_size: compressed data size in bytes, only valid when data is encrypted
* @data: data
*
- * Note, do not forget to amend 'zero_data_node_unused()' function when
- * changing the padding fields.
*/
struct ubifs_data_node {
struct ubifs_ch ch;
__u8 key[UBIFS_MAX_KEY_LEN];
__le32 size;
__le16 compr_type;
- __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
+ __le16 compr_size;
__u8 data[];
} __packed;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 096035eb29d0..ca72382ce6cc 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -38,6 +38,8 @@
#include <linux/backing-dev.h>
#include <linux/security.h>
#include <linux/xattr.h>
+#include <linux/fscrypto.h>
+#include <linux/random.h>
#include "ubifs-media.h"
/* Version of this UBIFS implementation */
@@ -83,10 +85,6 @@
*/
#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
-/* Write-buffer synchronization timeout interval in seconds */
-#define WBUF_TIMEOUT_SOFTLIMIT 3
-#define WBUF_TIMEOUT_HARDLIMIT 5
-
/* Maximum possible inode number (only 32-bit inodes are supported now) */
#define MAX_INUM 0xFFFFFFFF
@@ -138,6 +136,12 @@
*/
#define WORST_COMPR_FACTOR 2
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
+#else
+#define UBIFS_CIPHER_BLOCK_SIZE 0
+#endif
+
/*
* How much memory is needed for a buffer where we compress a data node.
*/
@@ -645,9 +649,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
* @io_mutex: serializes write-buffer I/O
* @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
* fields
- * @softlimit: soft write-buffer timeout interval
- * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit
- * and @softlimit + @delta)
* @timer: write-buffer timer
* @no_timer: non-zero if this write-buffer does not have a timer
* @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
@@ -676,8 +677,6 @@ struct ubifs_wbuf {
int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
struct mutex io_mutex;
spinlock_t lock;
- ktime_t softlimit;
- unsigned long long delta;
struct hrtimer timer;
unsigned int no_timer:1;
unsigned int need_sync:1;
@@ -1007,6 +1006,8 @@ struct ubifs_debug_info;
*
* @big_lpt: flag that LPT is too big to write whole during commit
* @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
+ * @double_hash: flag indicating that we can do lookups by hash
+ * @encrypted: flag indicating that this file system contains encrypted files
* @no_chk_data_crc: do not check CRCs when reading data nodes (except during
* recovery)
* @bulk_read: enable bulk-reads
@@ -1249,6 +1250,8 @@ struct ubifs_info {
unsigned int big_lpt:1;
unsigned int space_fixup:1;
+ unsigned int double_hash:1;
+ unsigned int encrypted:1;
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
unsigned int default_compr:2;
@@ -1515,25 +1518,29 @@ int ubifs_consolidate_log(struct ubifs_info *c);
/* journal.c */
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
- const struct qstr *nm, const struct inode *inode,
+ const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent);
int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len);
int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
- const struct dentry *fst_dentry,
+ const struct inode *fst_inode,
+ const struct fscrypt_name *fst_nm,
const struct inode *snd_dir,
- const struct dentry *snd_dentry, int sync);
+ const struct inode *snd_inode,
+ const struct fscrypt_name *snd_nm, int sync);
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
- const struct dentry *old_dentry,
+ const struct inode *old_inode,
+ const struct fscrypt_name *old_nm,
const struct inode *new_dir,
- const struct dentry *new_dentry,
+ const struct inode *new_inode,
+ const struct fscrypt_name *new_nm,
const struct inode *whiteout, int sync);
int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
loff_t old_size, loff_t new_size);
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
- const struct inode *inode, const struct qstr *nm);
+ const struct inode *inode, const struct fscrypt_name *nm);
int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
const struct inode *inode2);
@@ -1568,7 +1575,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode **zn, int *n);
int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm);
+ void *node, const struct fscrypt_name *nm);
+int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, uint32_t secondary_hash);
int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
void *node, int *lnum, int *offs);
int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
@@ -1576,16 +1585,16 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
int old_lnum, int old_offs, int lnum, int offs, int len);
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len, const struct qstr *nm);
+ int lnum, int offs, int len, const struct fscrypt_name *nm);
int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
- const struct qstr *nm);
+ const struct fscrypt_name *nm);
int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
union ubifs_key *to_key);
int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
union ubifs_key *key,
- const struct qstr *nm);
+ const struct fscrypt_name *nm);
void ubifs_tnc_close(struct ubifs_info *c);
int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
int lnum, int offs, int is_idx);
@@ -1642,6 +1651,7 @@ int ubifs_read_superblock(struct ubifs_info *c);
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
int ubifs_fixup_free_space(struct ubifs_info *c);
+int ubifs_enable_encryption(struct ubifs_info *c);
/* replay.c */
int ubifs_validate_entry(struct ubifs_info *c,
@@ -1733,16 +1743,21 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
#endif
/* dir.c */
-struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
umode_t mode);
int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
+int ubifs_check_dir_empty(struct inode *dir);
/* xattr.c */
extern const struct xattr_handler *ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
int ubifs_init_security(struct inode *dentry, struct inode *inode,
const struct qstr *qstr);
+int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
+ size_t size, int flags);
+ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
+ size_t size);
/* super.c */
struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
@@ -1781,6 +1796,66 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
+#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
+static inline int ubifs_encrypt(const struct inode *inode,
+ struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len,
+ int block)
+{
+ ubifs_assert(0);
+ return -EOPNOTSUPP;
+}
+static inline int ubifs_decrypt(const struct inode *inode,
+ struct ubifs_data_node *dn,
+ unsigned int *out_len, int block)
+{
+ ubifs_assert(0);
+ return -EOPNOTSUPP;
+}
+#else
+/* crypto.c */
+int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len, int block);
+int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int *out_len, int block);
+#endif
+
+extern struct fscrypt_operations ubifs_crypt_operations;
+
+static inline bool __ubifs_crypt_is_encrypted(struct inode *inode)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ return ui->flags & UBIFS_CRYPT_FL;
+}
+
+static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
+{
+ return __ubifs_crypt_is_encrypted((struct inode *)inode);
+}
+
/* Normal UBIFS messages */
__printf(2, 3)
void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index d9f9615bfd71..efe00fcb8b75 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -97,7 +97,7 @@ static const struct file_operations empty_fops;
* of failure.
*/
static int create_xattr(struct ubifs_info *c, struct inode *host,
- const struct qstr *nm, const void *value, int size)
+ const struct fscrypt_name *nm, const void *value, int size)
{
int err, names_len;
struct inode *inode;
@@ -117,7 +117,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
* extended attributes if the name list becomes larger. This limitation
* is artificial for UBIFS, though.
*/
- names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+ names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1;
if (names_len > XATTR_LIST_MAX) {
ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
host->i_ino, names_len, XATTR_LIST_MAX);
@@ -154,9 +154,18 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
host_ui->xattr_cnt += 1;
- host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
- host_ui->xattr_names += nm->len;
+ host_ui->xattr_names += fname_len(nm);
+
+ /*
+ * We handle UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT here because we
+ * have to set the UBIFS_CRYPT_FL flag on the host inode.
+ * To avoid multiple updates of the same inode in the same operation,
+ * let's do it here.
+ */
+ if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0)
+ host_ui->flags |= UBIFS_CRYPT_FL;
err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
if (err)
@@ -170,9 +179,10 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_cnt -= 1;
- host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
- host_ui->xattr_names -= nm->len;
+ host_ui->xattr_names -= fname_len(nm);
+ host_ui->flags &= ~UBIFS_CRYPT_FL;
mutex_unlock(&host_ui->ui_mutex);
out_free:
make_bad_inode(inode);
@@ -269,22 +279,28 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
return ERR_PTR(-EINVAL);
}
-static int __ubifs_setxattr(struct inode *host, const char *name,
- const void *value, size_t size, int flags)
+int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
- ubifs_assert(inode_is_locked(host));
+ /*
+ * Creating an encryption context is done unlocked since we
+ * operate on a new inode which is not visible to other users
+ * at this point.
+ */
+ if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) != 0)
+ ubifs_assert(inode_is_locked(host));
if (size > UBIFS_MAX_INO_DATA)
return -ERANGE;
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -329,18 +345,18 @@ out_free:
return err;
}
-static ssize_t __ubifs_getxattr(struct inode *host, const char *name,
- void *buf, size_t size)
+ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
+ size_t size)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_inode *ui;
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -387,6 +403,20 @@ out_unlock:
return err;
}
+static bool xattr_visible(const char *name)
+{
+ /* File encryption related xattrs are for internal use only */
+ if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0)
+ return false;
+
+ /* Show trusted namespace only for "power" users */
+ if (strncmp(name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) == 0 && !capable(CAP_SYS_ADMIN))
+ return false;
+
+ return true;
+}
+
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
union ubifs_key key;
@@ -395,7 +425,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_dent_node *xent, *pxent = NULL;
int err, len, written = 0;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
dentry, size);
@@ -419,15 +449,12 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
break;
}
- nm.name = xent->name;
- nm.len = le16_to_cpu(xent->nlen);
+ fname_name(&nm) = xent->name;
+ fname_len(&nm) = le16_to_cpu(xent->nlen);
- /* Show trusted namespace only for "power" users */
- if (strncmp(xent->name, XATTR_TRUSTED_PREFIX,
- XATTR_TRUSTED_PREFIX_LEN) ||
- capable(CAP_SYS_ADMIN)) {
- memcpy(buffer + written, nm.name, nm.len + 1);
- written += nm.len + 1;
+ if (xattr_visible(xent->name)) {
+ memcpy(buffer + written, fname_name(&nm), fname_len(&nm) + 1);
+ written += fname_len(&nm) + 1;
}
kfree(pxent);
@@ -446,7 +473,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
static int remove_xattr(struct ubifs_info *c, struct inode *host,
- struct inode *inode, const struct qstr *nm)
+ struct inode *inode, const struct fscrypt_name *nm)
{
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
@@ -463,9 +490,9 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
host_ui->xattr_cnt -= 1;
- host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
- host_ui->xattr_names -= nm->len;
+ host_ui->xattr_names -= fname_len(nm);
err = ubifs_jnl_delete_xattr(c, host, inode, nm);
if (err)
@@ -477,27 +504,27 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_cnt += 1;
- host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
- host_ui->xattr_names += nm->len;
+ host_ui->xattr_names += fname_len(nm);
mutex_unlock(&host_ui->ui_mutex);
ubifs_release_budget(c, &req);
make_bad_inode(inode);
return err;
}
-static int __ubifs_removexattr(struct inode *host, const char *name)
+static int ubifs_xattr_remove(struct inode *host, const char *name)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
ubifs_assert(inode_is_locked(host));
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -548,7 +575,8 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0);
+ err = ubifs_xattr_set(inode, name, xattr->value,
+ xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
@@ -572,7 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
return err;
}
-static int ubifs_xattr_get(const struct xattr_handler *handler,
+static int xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
@@ -580,10 +608,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
inode->i_ino, dentry, size);
name = xattr_full_name(handler, name);
- return __ubifs_getxattr(inode, name, buffer, size);
+ return ubifs_xattr_get(inode, name, buffer, size);
}
-static int ubifs_xattr_set(const struct xattr_handler *handler,
+static int xattr_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -594,27 +622,27 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
name = xattr_full_name(handler, name);
if (value)
- return __ubifs_setxattr(inode, name, value, size, flags);
+ return ubifs_xattr_set(inode, name, value, size, flags);
else
- return __ubifs_removexattr(inode, name);
+ return ubifs_xattr_remove(inode, name);
}
static const struct xattr_handler ubifs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
static const struct xattr_handler ubifs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
static const struct xattr_handler ubifs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
const struct xattr_handler *ubifs_xattr_handlers[] = {
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index aaec13c95253..2d0e028067eb 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,6 +30,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/bio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 988d5352bdb8..7aa48bd7cbaf 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/string.h>
+#include <linux/bio.h>
struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index aad46401ede5..0f3db71753aa 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
#include <linux/uio.h>
+#include <linux/bio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 67e085d591d8..a0376a2c1c29 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/bitops.h>
+#include <linux/bio.h>
#include <asm/byteorder.h>
#include "ufs_fs.h"
@@ -306,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
(unsigned long long)(pos + newb), pos);
bh->b_blocknr = newb + pos;
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
mark_buffer_dirty(bh);
++j;
bh = bh->b_this_page;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 190d64be22ed..7e41aee7b69a 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -25,7 +25,7 @@
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* we do not zeroize fragment, because of
* if it maped to hole, it already contains zeroes
@@ -1192,7 +1191,7 @@ out:
return err;
}
-void ufs_truncate_blocks(struct inode *inode)
+static void ufs_truncate_blocks(struct inode *inode)
{
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f04ab232d08d..131b2b77c818 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -71,7 +71,7 @@
#include <stdarg.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 85959d8324df..43953e03c356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -63,6 +63,7 @@ struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_t wq;
struct userfaultfd_ctx *ctx;
+ bool waken;
};
struct userfaultfd_wake_range {
@@ -86,6 +87,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
if (len && (start > uwq->msg.arg.pagefault.address ||
start + len <= uwq->msg.arg.pagefault.address))
goto out;
+ WRITE_ONCE(uwq->waken, true);
+ /*
+ * The implicit smp_mb__before_spinlock in try_to_wake_up()
+ * renders uwq->waken visible to other CPUs before the task is
+ * woken up.
+ */
ret = wake_up_state(wq->private, mode);
if (ret)
/*
@@ -257,18 +264,19 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
-int handle_userfault(struct fault_env *fe, unsigned long reason)
+int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
- struct mm_struct *mm = fe->vma->vm_mm;
+ struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
bool must_wait, return_to_userland;
+ long blocking_state;
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
- ctx = fe->vma->vm_userfaultfd_ctx.ctx;
+ ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -301,17 +309,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
- if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
+ BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -323,7 +332,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* and wait.
*/
ret = VM_FAULT_RETRY;
- if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
@@ -331,12 +340,15 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(fe->address, fe->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
uwq.ctx = ctx;
+ uwq.waken = false;
return_to_userland =
- (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+ blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE;
spin_lock(&ctx->fault_pending_wqh.lock);
/*
@@ -349,11 +361,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* following the spin_unlock to happen before the list_add in
* __add_wait_queue.
*/
- set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
- TASK_KILLABLE);
+ set_current_state(blocking_state);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+ reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
@@ -362,6 +374,29 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
wake_up_poll(&ctx->fd_wqh, POLLIN);
schedule();
ret |= VM_FAULT_MAJOR;
+
+ /*
+ * False wakeups can originate even from rwsem before
+ * up_read() however userfaults will wait either for a
+ * targeted wakeup on the specific uwq waitqueue from
+ * wake_userfault() or for signals or for uffd
+ * release.
+ */
+ while (!READ_ONCE(uwq.waken)) {
+ /*
+ * This needs the full smp_store_mb()
+ * guarantee as the state write must be
+ * visible to other CPUs before reading
+ * uwq.waken from other CPUs.
+ */
+ set_current_state(blocking_state);
+ if (READ_ONCE(uwq.waken) ||
+ READ_ONCE(ctx->released) ||
+ (return_to_userland ? signal_pending(current) :
+ fatal_signal_pending(current)))
+ break;
+ schedule();
+ }
}
__set_current_state(TASK_RUNNING);
diff --git a/fs/utimes.c b/fs/utimes.c
index 22307cdf7014..32b15b3f6629 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -8,7 +8,7 @@
#include <linux/stat.h>
#include <linux/utime.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/unistd.h>
#ifdef __ARCH_WANT_SYS_UTIME
@@ -48,7 +48,7 @@ static bool nsec_valid(long nsec)
return nsec >= 0 && nsec <= 999999999;
}
-static int utimes_common(struct path *path, struct timespec *times)
+static int utimes_common(const struct path *path, struct timespec *times)
{
int error;
struct iattr newattrs;
diff --git a/fs/xattr.c b/fs/xattr.c
index 2d13b4e62fae..7e3317cf4045 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -22,7 +22,7 @@
#include <linux/vmalloc.h>
#include <linux/posix_acl_xattr.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e5ebc3770460..33db69be4832 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
/*
* Per-AG Block Reservations
@@ -200,22 +201,30 @@ __xfs_ag_resv_init(
struct xfs_mount *mp = pag->pag_mount;
struct xfs_ag_resv *resv;
int error;
+ xfs_extlen_t reserved;
- resv = xfs_perag_resv(pag, type);
if (used > ask)
ask = used;
- resv->ar_asked = ask;
- resv->ar_reserved = resv->ar_orig_reserved = ask - used;
- mp->m_ag_max_usable -= ask;
+ reserved = ask - used;
- trace_xfs_ag_resv_init(pag, type, ask);
-
- error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
- if (error)
+ error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true);
+ if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
+ xfs_warn(mp,
+"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
+ pag->pag_agno);
+ return error;
+ }
- return error;
+ mp->m_ag_max_usable -= ask;
+
+ resv = xfs_perag_resv(pag, type);
+ resv->ar_asked = ask;
+ resv->ar_reserved = resv->ar_orig_reserved = reserved;
+
+ trace_xfs_ag_resv_init(pag, type, ask);
+ return 0;
}
/* Create a per-AG block reservation. */
@@ -223,6 +232,8 @@ int
xfs_ag_resv_init(
struct xfs_perag *pag)
{
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agnumber_t agno = pag->pag_agno;
xfs_extlen_t ask;
xfs_extlen_t used;
int error = 0;
@@ -231,23 +242,45 @@ xfs_ag_resv_init(
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_refcountbt_calc_reserves(pag->pag_mount,
- pag->pag_agno, &ask, &used);
+ error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
- error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
- ask, used);
+ error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error) {
+ /*
+ * Because we didn't have per-AG reservations when the
+ * finobt feature was added we might not be able to
+ * reserve all needed blocks. Warn and fall back to the
+ * old and potentially buggy code in that case, but
+ * ensure we do have the reservation for the refcountbt.
+ */
+ ask = used = 0;
+
+ mp->m_inotbt_nores = true;
+
+ error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
+ &used);
+ if (error)
+ goto out;
+
+ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+ ask, used);
+ if (error)
+ goto out;
+ }
}
/* Create the AGFL metadata reservation */
if (pag->pag_agfl_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
- &ask, &used);
+ error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
@@ -256,6 +289,16 @@ xfs_ag_resv_init(
goto out;
}
+#ifdef DEBUG
+ /* need to read in the AGF for the ASSERT below to work */
+ error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
+ if (error)
+ return error;
+
+ ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
+ pag->pagf_freeblks + pag->pagf_flcount);
+#endif
out:
return error;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index effb64cf714f..9f06a211e157 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -95,10 +95,7 @@ unsigned int
xfs_alloc_set_aside(
struct xfs_mount *mp)
{
- unsigned int blocks;
-
- blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
- return blocks;
+ return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
}
/*
@@ -365,36 +362,12 @@ xfs_alloc_fix_len(
return;
ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
ASSERT(rlen % args->prod == args->mod);
+ ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+ rlen + args->minleft);
args->len = rlen;
}
/*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
- xfs_alloc_arg_t *args) /* allocation argument structure */
-{
- xfs_agf_t *agf; /* a.g. freelist header */
- int diff; /* free space difference */
-
- if (args->minleft == 0)
- return 1;
- agf = XFS_BUF_TO_AGF(args->agbp);
- diff = be32_to_cpu(agf->agf_freeblks)
- - args->len - args->minleft;
- if (diff >= 0)
- return 1;
- args->len += diff; /* shrink the allocated space */
- /* casts to (int) catch length underflows */
- if ((int)args->len >= (int)args->minlen)
- return 1;
- args->agbno = NULLAGBLOCK;
- return 0;
-}
-
-/*
* Update the two btrees, logically removing from freespace the extent
* starting at rbno, rlen blocks. The extent is contained within the
* actual (current) free extent fbno for flen blocks.
@@ -689,8 +662,6 @@ xfs_alloc_ag_vextent(
xfs_alloc_arg_t *args) /* argument structure for allocation */
{
int error=0;
- xfs_extlen_t reservation;
- xfs_extlen_t oldmax;
ASSERT(args->minlen > 0);
ASSERT(args->maxlen > 0);
@@ -699,20 +670,6 @@ xfs_alloc_ag_vextent(
ASSERT(args->alignment > 0);
/*
- * Clamp maxlen to the amount of free space minus any reservations
- * that have been made.
- */
- oldmax = args->maxlen;
- reservation = xfs_ag_resv_needed(args->pag, args->resv);
- if (args->maxlen > args->pag->pagf_freeblks - reservation)
- args->maxlen = args->pag->pagf_freeblks - reservation;
- if (args->maxlen == 0) {
- args->agbno = NULLAGBLOCK;
- args->maxlen = oldmax;
- return 0;
- }
-
- /*
* Branch to correct routine based on the type.
*/
args->wasfromfl = 0;
@@ -731,8 +688,6 @@ xfs_alloc_ag_vextent(
/* NOTREACHED */
}
- args->maxlen = oldmax;
-
if (error || args->agbno == NULLAGBLOCK)
return error;
@@ -841,9 +796,6 @@ xfs_alloc_ag_vextent_exact(
args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
- args->agbno;
xfs_alloc_fix_len(args);
- if (!xfs_alloc_fix_minleft(args))
- goto not_found;
-
ASSERT(args->agbno + args->len <= tend);
/*
@@ -1149,12 +1101,7 @@ restart:
XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->len = blen;
- if (!xfs_alloc_fix_minleft(args)) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_near_nominleft(args);
- return 0;
- }
- blen = args->len;
+
/*
* We are allocating starting at bnew for blen blocks.
*/
@@ -1346,12 +1293,6 @@ restart:
*/
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
- if (!xfs_alloc_fix_minleft(args)) {
- trace_xfs_alloc_near_nominleft(args);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- return 0;
- }
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
args->datatype, ltbnoa, ltlena, &ltnew);
@@ -1553,8 +1494,6 @@ restart:
}
xfs_alloc_fix_len(args);
- if (!xfs_alloc_fix_minleft(args))
- goto out_nominleft;
rlen = args->len;
XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
/*
@@ -2056,7 +1995,7 @@ xfs_alloc_space_available(
int flags)
{
struct xfs_perag *pag = args->pag;
- xfs_extlen_t longest;
+ xfs_extlen_t alloc_len, longest;
xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
@@ -2066,17 +2005,28 @@ xfs_alloc_space_available(
reservation = xfs_ag_resv_needed(pag, args->resv);
/* do we have enough contiguous free space for the allocation? */
+ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
reservation);
- if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+ if (longest < alloc_len)
return false;
/* do we have enough free space remaining for the allocation? */
available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
- reservation - min_free - args->total);
- if (available < (int)args->minleft || available <= 0)
+ reservation - min_free - args->minleft);
+ if (available < (int)max(args->total, alloc_len))
return false;
+ /*
+ * Clamp maxlen to the amount of free space available for the actual
+ * extent allocation.
+ */
+ if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+ args->maxlen = available;
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->maxlen >= args->minlen);
+ }
+
return true;
}
@@ -2122,7 +2072,8 @@ xfs_alloc_fix_freelist(
}
need = xfs_alloc_min_freelist(mp, pag);
- if (!xfs_alloc_space_available(args, need, flags))
+ if (!xfs_alloc_space_available(args, need, flags |
+ XFS_ALLOC_FLAG_CHECK))
goto out_agbp_relse;
/*
@@ -2455,12 +2406,15 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
return false;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+ if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
return false;
if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+ (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
return false;
/*
@@ -2477,7 +2431,8 @@ xfs_agf_verify(
return false;
if (xfs_sb_version_hasreflink(&mp->m_sb) &&
- be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+ (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+ be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
return false;
return true;;
@@ -2634,12 +2589,10 @@ xfs_alloc_vextent(
xfs_agblock_t agsize; /* allocation group size */
int error;
int flags; /* XFS_ALLOC_FLAG_... locking flags */
- xfs_extlen_t minleft;/* minimum left value, temp copy */
xfs_mount_t *mp; /* mount structure pointer */
xfs_agnumber_t sagno; /* starting allocation group number */
xfs_alloctype_t type; /* input allocation type */
int bump_rotor = 0;
- int no_min = 0;
xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
mp = args->mp;
@@ -2668,7 +2621,6 @@ xfs_alloc_vextent(
trace_xfs_alloc_vextent_badargs(args);
return 0;
}
- minleft = args->minleft;
switch (type) {
case XFS_ALLOCTYPE_THIS_AG:
@@ -2679,9 +2631,7 @@ xfs_alloc_vextent(
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->pag = xfs_perag_get(mp, args->agno);
- args->minleft = 0;
error = xfs_alloc_fix_freelist(args, 0);
- args->minleft = minleft;
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto error0;
@@ -2746,9 +2696,7 @@ xfs_alloc_vextent(
*/
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
- if (no_min) args->minleft = 0;
error = xfs_alloc_fix_freelist(args, flags);
- args->minleft = minleft;
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto error0;
@@ -2788,20 +2736,17 @@ xfs_alloc_vextent(
* or switch to non-trylock mode.
*/
if (args->agno == sagno) {
- if (no_min == 1) {
+ if (flags == 0) {
args->agbno = NULLAGBLOCK;
trace_xfs_alloc_vextent_allfailed(args);
break;
}
- if (flags == 0) {
- no_min = 1;
- } else {
- flags = 0;
- if (type == XFS_ALLOCTYPE_START_BNO) {
- args->agbno = XFS_FSB_TO_AGBNO(mp,
- args->fsbno);
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- }
+
+ flags = 0;
+ if (type == XFS_ALLOCTYPE_START_BNO) {
+ args->agbno = XFS_FSB_TO_AGBNO(mp,
+ args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
}
}
xfs_perag_put(args->pag);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7c404a6b0ae3..1d0f48a501a3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t;
#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */
-
+#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */
/*
* Argument structure for xfs_alloc routines.
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 5ba2dac5e67c..efb467b10a71 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -421,13 +421,17 @@ xfs_allocbt_init_cursor(
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_allocbt_ops;
+ if (btnum == XFS_BTNUM_BNO)
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ else
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
if (btnum == XFS_BTNUM_CNT) {
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index af1ecb19121e..6622d46ddec3 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -131,9 +131,6 @@ xfs_attr_get(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (!xfs_inode_hasattr(ip))
- return -ENOATTR;
-
error = xfs_attr_args_init(&args, ip, name, flags);
if (error)
return error;
@@ -392,9 +389,6 @@ xfs_attr_remove(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- if (!xfs_inode_hasattr(dp))
- return -ENOATTR;
-
error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8ea91f363093..2852521fc8ec 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -253,6 +253,7 @@ xfs_attr3_leaf_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_perag *pag = bp->b_pag;
struct xfs_attr3_icleaf_hdr ichdr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -273,7 +274,12 @@ xfs_attr3_leaf_verify(
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
}
- if (ichdr.count == 0)
+ /*
+ * In recovery there is a transient state where count == 0 is valid
+ * because we may have transitioned an empty shortform attr to a leaf
+ * if the attr didn't fit in shortform.
+ */
+ if (pag && pag->pagf_init && ichdr.count == 0)
return false;
/* XXX: need to range check rest of attr header values */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 4f2aed04f827..f7dda0c237b0 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
-int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
@@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+void xfs_attr3_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c6eb21940783..bfc00de5c6f1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -49,6 +49,8 @@
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -190,8 +192,12 @@ xfs_bmap_worst_indlen(
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
+ xfs_filblks_t orig_len;
mp = ip->i_mount;
+
+ /* Calculate the worst-case size of the bmbt. */
+ orig_len = len;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
@@ -199,12 +205,20 @@ xfs_bmap_worst_indlen(
len += maxrecs - 1;
do_div(len, maxrecs);
rval += len;
- if (len == 1)
- return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ if (len == 1) {
+ rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
level - 1;
+ break;
+ }
if (level == 0)
maxrecs = mp->m_bmap_dmxr[1];
}
+
+ /* Calculate the worst-case size of the rmapbt. */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) +
+ mp->m_rmap_maxlevels;
+
return rval;
}
@@ -504,7 +518,7 @@ void
xfs_bmap_trace_exlist(
xfs_inode_t *ip, /* incore inode pointer */
xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr fork */
+ int whichfork, /* data or attr or cow fork */
unsigned long caller_ip)
{
xfs_extnum_t idx; /* extent record index */
@@ -513,11 +527,13 @@ xfs_bmap_trace_exlist(
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
+ else if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(cnt == xfs_iext_count(ifp));
for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+ trace_xfs_extlist(ip, idx, state, caller_ip);
}
/*
@@ -811,7 +827,7 @@ try_another_ag:
XFS_BTREE_LONG_PTRS);
arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (cnt = i = 0; i < nextents; i++) {
ep = xfs_iext_get_ext(ifp, i);
if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
@@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork(
goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
+ if (ip->i_d.di_anextents != 0) {
+ error = -EFSCORRUPTED;
+ goto trans_cancel;
+ }
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
/*
* For inodes coming from pre-6.2 filesystems.
@@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork(
ASSERT(ip->i_d.di_aformat == 0);
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
}
- ASSERT(ip->i_d.di_anextents == 0);
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1296,7 +1315,7 @@ xfs_bmap_read_extents(
/*
* Here with bp and block set to the leftmost leaf node in the tree.
*/
- room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ room = xfs_iext_count(ifp);
i = 0;
/*
* Loop over all leaf nodes. Copy information to the extent records.
@@ -1361,8 +1380,9 @@ xfs_bmap_read_extents(
return error;
block = XFS_BUF_TO_BLOCK(bp);
}
- ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ if (i != XFS_IFORK_NEXTENTS(ip, whichfork))
+ return -EFSCORRUPTED;
+ ASSERT(i == xfs_iext_count(ifp));
XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
return 0;
error0:
@@ -1370,97 +1390,6 @@ error0:
return -EFSCORRUPTED;
}
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t lastx; /* last extent index */
-
- /*
- * Initialize the extent entry structure to catch access to
- * uninitialized br_startblock field.
- */
- gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
- gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
- gotp->br_state = XFS_EXT_INVALID;
- gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
- prevp->br_startoff = NULLFILEOFF;
-
- ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
- if (lastx > 0) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
- }
- if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, gotp);
- *eofp = 0;
- } else {
- if (lastx > 0) {
- *gotp = *prevp;
- }
- *eofp = 1;
- ep = NULL;
- }
- *lastxp = lastx;
- return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int fork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
-
- XFS_STATS_INC(ip->i_mount, xs_look_exlist);
- ifp = XFS_IFORK_PTR(ip, fork);
-
- ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
- if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
- !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x lastx: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)gotp->br_startblock,
- (unsigned long long)gotp->br_startoff,
- (unsigned long long)gotp->br_blockcount,
- gotp->br_state, *lastxp);
- *lastxp = NULLEXTNUM;
- *eofp = 1;
- return NULL;
- }
- return ep;
-}
-
/*
* Returns the file-relative block number of the first unused block(s)
* in the file with at least "len" logically contiguous blocks free.
@@ -1497,7 +1426,7 @@ xfs_bmap_first_unused(
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
lowest = *first_unused;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
off = xfs_bmbt_get_startoff(ep);
@@ -1523,44 +1452,44 @@ xfs_bmap_first_unused(
*/
int /* error */
xfs_bmap_last_before(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
{
- xfs_fileoff_t bno; /* input file offset */
- int eof; /* hit end of file */
- xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_bmbt_irec_t got; /* current extent value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent used */
- xfs_bmbt_irec_t prev; /* previous extent value */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
+ int error;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return -EIO;
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_LOCAL:
*last_block = 0;
return 0;
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ return -EIO;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- bno = *last_block - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- if (eof || xfs_bmbt_get_startoff(ep) > bno) {
- if (prev.br_startoff == NULLFILEOFF)
- *last_block = 0;
- else
- *last_block = prev.br_startoff + prev.br_blockcount;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
}
- /*
- * Otherwise *last_block is already the right answer.
- */
+
+ if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) {
+ if (got.br_startoff <= *last_block - 1)
+ return 0;
+ }
+
+ if (xfs_iext_get_extent(ifp, idx - 1, &got)) {
+ *last_block = got.br_startoff + got.br_blockcount;
+ return 0;
+ }
+
+ *last_block = 0;
return 0;
}
@@ -1582,7 +1511,7 @@ xfs_bmap_last_extent(
return error;
}
- nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
*is_empty = 1;
return 0;
@@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real(
&bma->ip->i_d.di_nextents);
ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < xfs_iext_count(ifp) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
@@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real(
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
ASSERT(*idx >= 0);
- ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < xfs_iext_count(&ip->i_df) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
@@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
@@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real(
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (bma->idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
if (isnullstartblock(right.br_startblock))
@@ -3700,7 +3629,7 @@ xfs_bmap_btalloc(
align = xfs_get_cowextsz_hint(ap->ip);
else if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
- if (unlikely(align)) {
+ if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
&ap->offset, &ap->length);
@@ -3772,7 +3701,7 @@ xfs_bmap_btalloc(
args.minlen = ap->minlen;
}
/* apply extent size hints if obtained earlier */
- if (unlikely(align)) {
+ if (align) {
args.prod = align;
if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
@@ -3883,7 +3812,6 @@ xfs_bmap_btalloc(
args.fsbno = 0;
args.type = XFS_ALLOCTYPE_FIRST_AG;
args.total = ap->minlen;
- args.minleft = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
ap->dfops->dop_low = true;
@@ -4145,12 +4073,11 @@ xfs_bmapi_read(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec prev;
xfs_fileoff_t obno;
xfs_fileoff_t end;
- xfs_extnum_t lastx;
+ xfs_extnum_t idx;
int error;
- int eof;
+ bool eof = false;
int n = 0;
int whichfork = xfs_bmapi_whichfork(flags);
@@ -4190,7 +4117,8 @@ xfs_bmapi_read(
return error;
}
- xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
+ eof = true;
end = bno + len;
obno = bno;
@@ -4221,10 +4149,8 @@ xfs_bmapi_read(
break;
/* Else go on to the next record. */
- if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
- else
- eof = 1;
+ if (!xfs_iext_get_extent(ifp, ++idx, &got))
+ eof = true;
}
*nmap = n;
return 0;
@@ -4234,10 +4160,10 @@ int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
- xfs_fileoff_t aoff,
+ xfs_fileoff_t off,
xfs_filblks_t len,
+ xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *prev,
xfs_extnum_t *lastx,
int eof)
{
@@ -4248,10 +4174,17 @@ xfs_bmapi_reserve_delalloc(
char rt = XFS_IS_REALTIME_INODE(ip);
xfs_extlen_t extsz;
int error;
+ xfs_fileoff_t aoff = off;
- alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
/* Figure out the extent size, adjust alen */
if (whichfork == XFS_COW_FORK)
@@ -4259,7 +4192,12 @@ xfs_bmapi_reserve_delalloc(
else
extsz = xfs_get_extsz_hint(ip);
if (extsz) {
- error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+ struct xfs_bmbt_irec prev;
+
+ if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
@@ -4312,6 +4250,16 @@ xfs_bmapi_reserve_delalloc(
*/
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
ASSERT(got->br_startoff <= aoff);
ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
ASSERT(isnullstartblock(got->br_startblock));
@@ -4349,7 +4297,7 @@ xfs_bmapi_allocate(
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- if (bma->idx != NULLEXTNUM && bma->idx) {
+ if (bma->idx) {
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
&bma->prev);
}
@@ -4395,8 +4343,6 @@ xfs_bmapi_allocate(
if (error)
return error;
- if (bma->dfops->dop_low)
- bma->minleft = 0;
if (bma->cur)
bma->cur->bc_private.b.firstblock = *bma->firstblock;
if (bma->blkno == NULLFSBLOCK)
@@ -4563,13 +4509,11 @@ xfs_bmapi_write(
struct xfs_ifork *ifp;
struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
- int eof; /* after the end of extents */
+ bool eof = false; /* after the end of extents */
int error; /* error return */
int n; /* current extent index */
xfs_fileoff_t obno; /* old block number (offset) */
int whichfork; /* data or attr fork */
- char inhole; /* current location is hole in file */
- char wasdelay; /* old extent was delayed */
#ifdef DEBUG
xfs_fileoff_t orig_bno; /* original block number value */
@@ -4641,12 +4585,14 @@ xfs_bmapi_write(
goto error0;
}
- xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
- &bma.prev);
n = 0;
end = bno + len;
obno = bno;
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got))
+ eof = true;
+ if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
bma.tp = tp;
bma.ip = ip;
bma.total = total;
@@ -4655,22 +4601,44 @@ xfs_bmapi_write(
bma.firstblock = firstblock;
while (bno < end && n < *nmap) {
- inhole = eof || bma.got.br_startoff > bno;
- wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+ bool need_alloc = false, wasdelay = false;
- /*
- * Make sure we only reflink into a hole.
- */
- if (flags & XFS_BMAPI_REMAP)
- ASSERT(inhole);
- if (flags & XFS_BMAPI_COWFORK)
- ASSERT(!inhole);
+ /* in hole or beyond EOF? */
+ if (eof || bma.got.br_startoff > bno) {
+ if (flags & XFS_BMAPI_DELALLOC) {
+ /*
+ * For the COW fork we can reasonably get a
+ * request for converting an extent that races
+ * with other threads already having converted
+ * part of it, because converting COW extents to
+ * regular blocks is not protected by the
+ * IOLOCK.
+ */
+ ASSERT(flags & XFS_BMAPI_COWFORK);
+ if (!(flags & XFS_BMAPI_COWFORK)) {
+ error = -EIO;
+ goto error0;
+ }
+
+ if (eof || bno >= end)
+ break;
+ } else {
+ need_alloc = true;
+ }
+ } else {
+ /*
+ * Make sure we only reflink into a hole.
+ */
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
+ if (isnullstartblock(bma.got.br_startblock))
+ wasdelay = true;
+ }
/*
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if (inhole || wasdelay) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4733,11 +4701,8 @@ xfs_bmapi_write(
/* Else go on to the next record. */
bma.prev = bma.got;
- if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
- &bma.got);
- } else
- eof = 1;
+ if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got))
+ eof = true;
}
*nmap = n;
@@ -4885,7 +4850,7 @@ xfs_bmap_del_extent_delay(
da_new = 0;
ASSERT(*idx >= 0);
- ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -4902,8 +4867,11 @@ xfs_bmap_del_extent_delay(
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0,
+ error = xfs_trans_reserve_quota_nblks(NULL, ip,
+ -((long)del->br_blockcount), 0,
isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ return error;
ip->i_delayed_blks -= del->br_blockcount;
if (whichfork == XFS_COW_FORK)
@@ -5013,7 +4981,7 @@ xfs_bmap_del_extent_cow(
got_endoff = got->br_startoff + got->br_blockcount;
ASSERT(*idx >= 0);
- ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -5119,8 +5087,7 @@ xfs_bmap_del_extent(
state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp)));
ASSERT(del->br_blockcount > 0);
ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
@@ -5434,8 +5401,6 @@ __xfs_bunmapi(
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_irec_t del; /* extent being deleted */
- int eof; /* is deleting at eof */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
xfs_extnum_t extno; /* extent number in list */
xfs_bmbt_irec_t got; /* current extent record */
@@ -5445,8 +5410,6 @@ __xfs_bunmapi(
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_bmbt_irec_t prev; /* previous extent record */
xfs_fileoff_t start; /* first file offset deleted */
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
@@ -5477,8 +5440,7 @@ __xfs_bunmapi(
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
+ if (xfs_iext_count(ifp) == 0) {
*rlen = 0;
return 0;
}
@@ -5486,18 +5448,17 @@ __xfs_bunmapi(
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
/*
* Check to see if the given block number is past the end of the
* file, back up to the last block if so...
*/
- if (eof) {
- ep = xfs_iext_get_ext(ifp, --lastx);
- xfs_bmbt_get_all(ep, &got);
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) {
+ ASSERT(lastx > 0);
+ xfs_iext_get_extent(ifp, --lastx, &got);
bno = got.br_startoff + got.br_blockcount - 1;
}
+
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
@@ -5528,8 +5489,7 @@ __xfs_bunmapi(
if (got.br_startoff > bno) {
if (--lastx < 0)
break;
- ep = xfs_iext_get_ext(ifp, lastx);
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, lastx, &got);
}
/*
* Is the last block of this extent before the range
@@ -5543,7 +5503,6 @@ __xfs_bunmapi(
* Then deal with the (possibly delayed) allocated space
* we found.
*/
- ASSERT(ep != NULL);
del = got;
wasdel = isnullstartblock(del.br_startblock);
if (got.br_startoff < start) {
@@ -5624,15 +5583,12 @@ __xfs_bunmapi(
*/
ASSERT(bno >= del.br_blockcount);
bno -= del.br_blockcount;
- if (got.br_startoff > bno) {
- if (--lastx >= 0) {
- ep = xfs_iext_get_ext(ifp,
- lastx);
- xfs_bmbt_get_all(ep, &got);
- }
- }
+ if (got.br_startoff > bno && --lastx >= 0)
+ xfs_iext_get_extent(ifp, lastx, &got);
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec prev;
+
/*
* This one is already unwritten.
* It must have a written left neighbor.
@@ -5640,8 +5596,7 @@ __xfs_bunmapi(
* try again.
*/
ASSERT(lastx > 0);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
- lastx - 1), &prev);
+ xfs_iext_get_extent(ifp, lastx - 1, &prev);
ASSERT(prev.br_state == XFS_EXT_NORM);
ASSERT(!isnullstartblock(prev.br_startblock));
ASSERT(del.br_startblock ==
@@ -5739,13 +5694,9 @@ nodelete:
*/
if (bno != (xfs_fileoff_t)-1 && bno >= start) {
if (lastx >= 0) {
- ep = xfs_iext_get_ext(ifp, lastx);
- if (xfs_bmbt_get_startoff(ep) > bno) {
- if (--lastx >= 0)
- ep = xfs_iext_get_ext(ifp,
- lastx);
- }
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, lastx, &got);
+ if (got.br_startoff > bno && --lastx >= 0)
+ xfs_iext_get_extent(ifp, lastx, &got);
}
extno++;
}
@@ -5963,7 +5914,7 @@ xfs_bmse_shift_one(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
xfs_bmbt_get_all(gotp, &got);
@@ -6140,7 +6091,7 @@ xfs_bmap_shift_extents(
* are collapsing out, so we cannot use the count of real extents here.
* Instead we have to calculate it from the incore fork.
*/
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
if (total_extents == 0) {
*done = 1;
goto del_cursor;
@@ -6200,7 +6151,7 @@ xfs_bmap_shift_extents(
* count can change. Update the total and grade the next record.
*/
if (direction == SHIFT_LEFT) {
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
stop_extent = total_extents;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 7cae6ec27fa6..cdef87db5262 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -110,6 +110,9 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
+/* Only convert delalloc space, don't allocate entirely new extents */
+#define XFS_BMAPI_DELALLOC 0x400
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -120,7 +123,8 @@ struct xfs_extent_free_item
{ XFS_BMAPI_CONVERT, "CONVERT" }, \
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
- { XFS_BMAPI_COWFORK, "COWFORK" }
+ { XFS_BMAPI_COWFORK, "COWFORK" }, \
+ { XFS_BMAPI_DELALLOC, "DELALLOC" }
static inline int xfs_bmapi_aflag(int w)
@@ -237,14 +241,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
-struct xfs_bmbt_rec_host *
- xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
- int fork, int *eofp, xfs_extnum_t *lastxp,
- struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t aoff, xfs_filblks_t len,
- struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
- xfs_extnum_t *lastx, int eof);
+ xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 8007d2ba9aef..d9be241fc86f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -502,12 +502,11 @@ try_another_ag:
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
- * a full btree split. Try again without minleft and if
+ * a full btree split. Try again and if
* successful activate the lowspace algorithm.
*/
args.fsbno = 0;
args.type = XFS_ALLOCTYPE_FIRST_AG;
- args.minleft = 0;
error = xfs_alloc_vextent(&args);
if (error)
goto error0;
@@ -796,13 +795,14 @@ xfs_bmbt_init_cursor(
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
cur->bc_btnum = XFS_BTNUM_BMAP;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 0e80993c8a59..21e6a6ab6b9a 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block(
if (error)
return error;
+ /* Check the inode owner since the verifiers don't. */
+ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+ be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+ cur->bc_private.b.ip->i_ino)
+ goto out_bad;
+
+ /* Did we get the level we were looking for? */
+ if (be16_to_cpu((*blkp)->bb_level) != level)
+ goto out_bad;
+
+ /* Check that internal nodes have at least one record. */
+ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
+ goto out_bad;
+
xfs_btree_setbuf(cur, level, bp);
return 0;
+
+out_bad:
+ *blkp = NULL;
+ xfs_trans_brelse(cur->bc_tp, bp);
+ return -EFSCORRUPTED;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index c2b01d1c79ee..b69b947c4c1b 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -96,46 +96,10 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(mp, type, stat) \
- XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
#define XFS_BTREE_STATS_INC(cur, stat) \
-do { \
- struct xfs_mount *__mp = cur->bc_mp; \
- switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
- case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
- case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
- } \
-} while (0)
-
-#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
- XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
-#define XFS_BTREE_STATS_ADD(cur, stat, val) \
-do { \
- struct xfs_mount *__mp = cur->bc_mp; \
- switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: \
- __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
- case XFS_BTNUM_CNT: \
- __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: \
- __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
- case XFS_BTNUM_INO: \
- __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
- case XFS_BTNUM_FINO: \
- __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
- case XFS_BTNUM_RMAP: \
- __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
- case XFS_BTNUM_REFC: \
- __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
- } \
-} while (0)
+ XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
@@ -253,6 +217,7 @@ typedef struct xfs_btree_cur
__uint8_t bc_nlevels; /* number of levels in the tree */
__uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
+ int bc_statoff; /* offset of btree stats array */
union {
struct { /* needed for BNO, CNT, INO */
struct xfs_buf *agbp; /* agf/agi buffer pointer */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..a416c7cb23ea 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -6,10 +6,11 @@
/*
* Calculate the intermediate checksum for a buffer that has the CRC field
* inside it. The offset of the 32bit crc fields is passed as the
- * cksum_offset parameter.
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
*/
static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
{
__uint32_t zero = 0;
__uint32_t crc;
@@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
}
/*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline __uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ /* zero the CRC field */
+ *(__le32 *)(buffer + cksum_offset) = 0;
+
+ /* single pass CRC calculation for the entire buffer */
+ return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
+/*
* Convert the intermediate checksum to the final ondisk format.
*
* The CRC32c calculation uses LE format even on BE machines, but returns the
@@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc)
/*
* Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
*/
static inline void
xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+ __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
}
@@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
static inline int
xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+ __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 20a96dd5af7e..2f389d366e93 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -36,21 +36,29 @@
struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
/*
- * @mode, if set, indicates that the type field needs to be set up.
- * This uses the transformation from file mode to DT_* as defined in linux/fs.h
- * for file type specification. This will be propagated into the directory
- * structure if appropriate for the given operation and filesystem config.
+ * Convert inode mode to directory entry filetype
*/
-const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
- [0] = XFS_DIR3_FT_UNKNOWN,
- [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
- [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
-};
+unsigned char xfs_mode_to_ftype(int mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ return XFS_DIR3_FT_REG_FILE;
+ case S_IFDIR:
+ return XFS_DIR3_FT_DIR;
+ case S_IFCHR:
+ return XFS_DIR3_FT_CHRDEV;
+ case S_IFBLK:
+ return XFS_DIR3_FT_BLKDEV;
+ case S_IFIFO:
+ return XFS_DIR3_FT_FIFO;
+ case S_IFSOCK:
+ return XFS_DIR3_FT_SOCK;
+ case S_IFLNK:
+ return XFS_DIR3_FT_SYMLINK;
+ default:
+ return XFS_DIR3_FT_UNKNOWN;
+ }
+}
/*
* ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -93,7 +101,7 @@ xfs_ascii_ci_compname(
return result;
}
-static struct xfs_nameops xfs_ascii_ci_nameops = {
+static const struct xfs_nameops xfs_ascii_ci_nameops = {
.hashname = xfs_ascii_ci_hashname,
.compname = xfs_ascii_ci_compname,
};
@@ -631,7 +639,8 @@ xfs_dir2_isblock(
if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+ if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize)
+ return -EFSCORRUPTED;
*vp = rval;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index becc926c3e3d..d6e6d9d16f6c 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,6 +18,9 @@
#ifndef __XFS_DIR2_H__
#define __XFS_DIR2_H__
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+
struct xfs_defer_ops;
struct xfs_da_args;
struct xfs_inode;
@@ -32,10 +35,9 @@ struct xfs_dir2_data_unused;
extern struct xfs_name xfs_name_dotdot;
/*
- * directory filetype conversion tables.
+ * Convert inode mode to directory entry filetype
*/
-#define S_SHIFT 12
-extern const unsigned char xfs_mode_to_ftype[];
+extern unsigned char xfs_mode_to_ftype(int mode);
/*
* directory operations vector for encode/decode routines
@@ -157,6 +159,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
+extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo,
+ const struct xfs_dir_ops *ops,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
@@ -177,6 +182,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
struct xfs_dir2_data_unused *dup);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 725fc7841fde..d478065b9544 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -329,7 +329,7 @@ xfs_dir3_data_read(
err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
@@ -505,8 +505,9 @@ xfs_dir2_data_freeremove(
* Given a data block, reconstruct its bestfree map.
*/
void
-xfs_dir2_data_freescan(
- struct xfs_inode *dp,
+xfs_dir2_data_freescan_int(
+ struct xfs_da_geometry *geo,
+ const struct xfs_dir_ops *ops,
struct xfs_dir2_data_hdr *hdr,
int *loghead)
{
@@ -516,7 +517,6 @@ xfs_dir2_data_freescan(
struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
- struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -526,13 +526,13 @@ xfs_dir2_data_freescan(
/*
* Start by clearing the table.
*/
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = ops->data_bestfree_p(hdr);
memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)dp->d_ops->data_entry_p(hdr);
+ p = (char *)ops->data_entry_p(hdr);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
btp = xfs_dir2_block_tail_p(geo, hdr);
@@ -559,12 +559,22 @@ xfs_dir2_data_freescan(
else {
dep = (xfs_dir2_data_entry_t *)p;
ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
- p += dp->d_ops->data_entsize(dep->namelen);
+ be16_to_cpu(*ops->data_entry_tag_p(dep)));
+ p += ops->data_entsize(dep->namelen);
}
}
}
+void
+xfs_dir2_data_freescan(
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
+{
+ return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops,
+ hdr, loghead);
+}
+
/*
* Initialize a data block at the given block number in the directory.
* Give back the buffer for the created block.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index ef9f6ead96a4..d04547fcf274 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -21,7 +21,6 @@
struct dir_context;
/* xfs_dir2.c */
-extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 51b4e0de1fdc..f272abff11e1 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2344,7 +2344,8 @@ xfs_imap(
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
- imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+ imap->im_boffset = (unsigned short)(offset <<
+ mp->m_sb.sb_inodelog);
return 0;
}
@@ -2372,7 +2373,7 @@ out_map:
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
- imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+ imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
/*
* If the inode number maps to a block outside the bounds
@@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi(
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
-
/*
* Compute byte offsets for the first and last fields in the first
* region and log the agi buffer. This only logs up through
@@ -2512,8 +2511,15 @@ xfs_agi_verify(
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return false;
- if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ if (be32_to_cpu(agi->agi_level) < 1 ||
+ be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ return false;
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ (be32_to_cpu(agi->agi_free_level) < 1 ||
+ be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS))
return false;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2592,6 +2598,8 @@ xfs_read_agi(
XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
+ if (tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
return 0;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index eab68ae2e011..7c471881c9a6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
}
STATIC int
-xfs_inobt_alloc_block(
+__xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int *stat)
+ int *stat,
+ enum xfs_ag_resv_type resv)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
args.maxlen = 1;
args.prod = 1;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.resv = resv;
error = xfs_alloc_vextent(&args);
if (error) {
@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
}
STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
+{
+ return __xfs_inobt_alloc_block(cur, start, new, stat,
+ XFS_AG_RESV_METADATA);
+}
+
+STATIC int
xfs_inobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.dup_cursor = xfs_inobt_dup_cursor,
.set_root = xfs_finobt_set_root,
- .alloc_block = xfs_inobt_alloc_block,
+ .alloc_block = xfs_finobt_alloc_block,
.free_block = xfs_inobt_free_block,
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
@@ -357,7 +380,7 @@ xfs_inobt_init_cursor(
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
@@ -365,9 +388,11 @@ xfs_inobt_init_cursor(
if (btnum == XFS_BTNUM_INO) {
cur->bc_nlevels = be32_to_cpu(agi->agi_level);
cur->bc_ops = &xfs_inobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
} else {
cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
cur->bc_ops = &xfs_finobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
}
cur->bc_blocklog = mp->m_sb.sb_blocklog;
@@ -478,3 +503,64 @@ xfs_inobt_rec_check_count(
return 0;
}
#endif /* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+ struct xfs_mount *mp)
+{
+ /* Bail out if we're uninitialized, which can happen in mkfs. */
+ if (mp->m_inobt_mxr[0] == 0)
+ return 0;
+
+ return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+ (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+ XFS_INODES_PER_CHUNK);
+}
+
+static int
+xfs_inobt_count_blocks(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+ if (error)
+ return error;
+
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
+ error = xfs_btree_count_blocks(cur, tree_blocks);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+
+ return error;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *ask,
+ xfs_extlen_t *used)
+{
+ xfs_extlen_t tree_len = 0;
+ int error;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return 0;
+
+ error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
+ if (error)
+ return error;
+
+ *ask += xfs_inobt_max_size(mp);
+ *used += tree_len;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453217ce..aa81e2e63f3f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
#define xfs_inobt_rec_check_count(mp, rec) 0
#endif /* DEBUG */
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_extlen_t *ask, xfs_extlen_t *used);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 134424fac434..d93f9d918cfc 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -29,6 +29,7 @@
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
/*
* Check that none of the inode's in the buffer have a next
@@ -383,15 +384,28 @@ xfs_log_dinode_to_disk(
static bool
xfs_dinode_verify(
struct xfs_mount *mp,
- struct xfs_inode *ip,
+ xfs_ino_t ino,
struct xfs_dinode *dip)
{
+ uint16_t mode;
uint16_t flags;
uint64_t flags2;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return false;
+ /* don't allow invalid i_size */
+ if (be64_to_cpu(dip->di_size) & (1ULL << 63))
+ return false;
+
+ mode = be16_to_cpu(dip->di_mode);
+ if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
+ return false;
+
+ /* No zero-length symlinks/dirs. */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0)
+ return false;
+
/* only version 3 or greater inodes are extensively verified here */
if (dip->di_version < 3)
return true;
@@ -401,7 +415,7 @@ xfs_dinode_verify(
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
return false;
- if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+ if (be64_to_cpu(dip->di_ino) != ino)
return false;
if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
@@ -436,7 +450,7 @@ xfs_dinode_calc_crc(
return;
ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
- crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF);
dip->di_crc = xfs_end_cksum(crc);
}
@@ -493,7 +507,7 @@ xfs_iread(
return error;
/* even unallocated inodes are verified */
- if (!xfs_dinode_verify(mp, ip, dip)) {
+ if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
xfs_alert(mp, "%s: validation failed for inode %lld failed",
__func__, ip->i_ino);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 3cfe12a4f58a..6848a0afbce7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -58,8 +58,8 @@ struct xfs_icdinode {
*/
struct xfs_imap {
xfs_daddr_t im_blkno; /* starting BB of inode chunk */
- ushort im_len; /* length in BBs of inode chunk */
- ushort im_boffset; /* inode offset in block in bytes */
+ unsigned short im_len; /* length in BBs of inode chunk */
+ unsigned short im_boffset; /* inode offset in block in bytes */
};
int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 5dd56d3dbb3a..222e103356c6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -775,6 +775,13 @@ xfs_idestroy_fork(
}
}
+/* Count number of incore extents based on if_bytes */
+xfs_extnum_t
+xfs_iext_count(struct xfs_ifork *ifp)
+{
+ return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+}
+
/*
* Convert in-core extents to on-disk form
*
@@ -803,7 +810,7 @@ xfs_iextents_copy(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(ifp->if_bytes > 0);
- nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nrecs = xfs_iext_count(ifp);
XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
ASSERT(nrecs > 0);
@@ -941,7 +948,7 @@ xfs_iext_get_ext(
xfs_extnum_t idx) /* index of target extent */
{
ASSERT(idx >= 0);
- ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(idx < xfs_iext_count(ifp));
if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
return ifp->if_u1.if_ext_irec->er_extbuf;
@@ -1017,7 +1024,7 @@ xfs_iext_add(
int new_size; /* size of extents after adding */
xfs_extnum_t nextents; /* number of extents in file */
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT((idx >= 0) && (idx <= nextents));
byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
new_size = ifp->if_bytes + byte_diff;
@@ -1241,7 +1248,7 @@ xfs_iext_remove(
trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
ASSERT(ext_diff > 0);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
if (new_size == 0) {
@@ -1270,7 +1277,7 @@ xfs_iext_remove_inline(
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
ASSERT(idx < XFS_INLINE_EXTS);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(((nextents - ext_diff) > 0) &&
(nextents - ext_diff) < XFS_INLINE_EXTS);
@@ -1309,7 +1316,7 @@ xfs_iext_remove_direct(
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
new_size = ifp->if_bytes -
(ext_diff * sizeof(xfs_bmbt_rec_t));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (new_size == 0) {
xfs_iext_destroy(ifp);
@@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct(
int size; /* size of file extents */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(nextents <= XFS_LINEAR_EXTS);
size = nextents * sizeof(xfs_bmbt_rec_t);
@@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext(
xfs_extnum_t nextents; /* number of file extents */
xfs_fileoff_t startoff = 0; /* start offset of extent */
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
*idxp = 0;
return NULL;
@@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec(
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
ASSERT(page_idx >= 0);
- ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
- ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+ ASSERT(page_idx <= xfs_iext_count(ifp));
+ ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
erp_idx = 0;
@@ -1782,7 +1789,7 @@ xfs_iext_irec_init(
xfs_extnum_t nextents; /* number of extents in file */
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(nextents <= XFS_LINEAR_EXTS);
erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
@@ -1906,7 +1913,7 @@ xfs_iext_irec_compact(
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
xfs_iext_destroy(ifp);
@@ -1996,3 +2003,49 @@ xfs_ifork_init_cow(
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
}
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno return true, and store the expanded
+ * extent structure in *gotp, and the extent index in *idxp.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its index in *idxp
+ * instead.
+ * If bno is beyond the last extent return false, and return the index after
+ * the last valid index in *idxp.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t bno,
+ xfs_extnum_t *idxp,
+ struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_bmbt_rec_host *ep;
+
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ ep = xfs_iext_bno_to_ext(ifp, bno, idxp);
+ if (!ep)
+ return false;
+ xfs_bmbt_get_all(ep, gotp);
+ return true;
+}
+
+/*
+ * Return true if there is an extent at index idx, and return the expanded
+ * extent structure at idx in that case. Else return false.
+ */
+bool
+xfs_iext_get_extent(
+ struct xfs_ifork *ifp,
+ xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp)
+{
+ if (idx < 0 || idx >= xfs_iext_count(ifp))
+ return false;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index c9476f50e32d..7fb8365326d1 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
struct xfs_bmbt_rec_host *
xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *);
void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
struct xfs_bmbt_irec *, int);
void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
@@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *);
void xfs_iext_irec_compact_full(struct xfs_ifork *);
void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+bool xfs_iext_lookup_extent(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t bno,
+ xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp);
+
extern struct kmem_zone *xfs_ifork_zone;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 083cdd6d6c28..7ae571f8e34a 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version)
typedef struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
+ unsigned short blf_flags; /* misc state */
+ unsigned short blf_len; /* number of blocks in this buf */
__int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 8e385f91d660..d9f65e2d5cc8 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -52,7 +52,7 @@ typedef struct xlog_recover {
struct list_head r_itemq; /* q for items */
} xlog_recover_t;
-#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
/*
* This is the number of entries in the l_buf_cancel_table used during
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 453bb2757ec2..50add5272807 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor(
cur->bc_btnum = XFS_BTNUM_REFC;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_refcountbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
@@ -408,13 +409,14 @@ xfs_refcountbt_calc_size(
*/
xfs_extlen_t
xfs_refcountbt_max_size(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
{
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (mp->m_refc_mxr[0] == 0)
return 0;
- return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+ return xfs_refcountbt_calc_size(mp, agblocks);
}
/*
@@ -429,22 +431,24 @@ xfs_refcountbt_calc_reserves(
{
struct xfs_buf *agbp;
struct xfs_agf *agf;
+ xfs_agblock_t agblocks;
xfs_extlen_t tree_len;
int error;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return 0;
- *ask += xfs_refcountbt_max_size(mp);
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
if (error)
return error;
agf = XFS_BUF_TO_AGF(agbp);
+ agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_buf_relse(agbp);
+ *ask += xfs_refcountbt_max_size(mp, agblocks);
*used += tree_len;
return error;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index 3be7768bd51a..9db008b955b7 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -66,7 +66,8 @@ extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 83e672ff7577..74e5a54bc428 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_rmapbt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
@@ -549,13 +550,14 @@ xfs_rmapbt_calc_size(
*/
xfs_extlen_t
xfs_rmapbt_max_size(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ xfs_agblock_t agblocks)
{
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (mp->m_rmap_mxr[0] == 0)
return 0;
- return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+ return xfs_rmapbt_calc_size(mp, agblocks);
}
/*
@@ -570,25 +572,24 @@ xfs_rmapbt_calc_reserves(
{
struct xfs_buf *agbp;
struct xfs_agf *agf;
- xfs_extlen_t pool_len;
+ xfs_agblock_t agblocks;
xfs_extlen_t tree_len;
int error;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
- /* Reserve 1% of the AG or enough for 1 block per record. */
- pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
- *ask += pool_len;
-
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
if (error)
return error;
agf = XFS_BUF_TO_AGF(agbp);
+ agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_buf_relse(agbp);
+ /* Reserve 1% of the AG or enough for 1 block per record. */
+ *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
*used += tree_len;
return error;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 2a9ac472fb15..19c08e933049 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -60,7 +60,8 @@ extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
+ xfs_agblock_t agblocks);
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e2e1106c9fad..ea45584a9913 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,4 +1016,3 @@ xfs_rtfree_extent(
}
return 0;
}
-
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a70aec910626..584ec896a533 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -242,7 +242,7 @@ xfs_mount_validate_sb(
sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
- sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
@@ -262,6 +262,12 @@ xfs_mount_validate_sb(
return -EFSCORRUPTED;
}
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+ xfs_notice(mp, "v5 SB sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
/*
* Until this is fixed only page-sized or smaller data blocks work.
*/
@@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
- if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT &&
+ sbp->sb_gquotino != NULLFSINO) {
/*
* In older version of superblock, on-disk superblock only
* has sb_gquotino, and in-core superblock has both sb_gquotino
* and sb_pquotino. But, only one of them is supported at any
* point of time. So, if PQUOTA is set in disk superblock,
- * copy over sb_gquotino to sb_pquotino.
+ * copy over sb_gquotino to sb_pquotino. The NULLFSINO test
+ * above is to make sure we don't do this twice and wipe them
+ * both out!
*/
sbp->sb_pquotino = sbp->sb_gquotino;
sbp->sb_gquotino = NULLFSINO;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8d74870468c2..717909f2f7b7 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
-#define NULLEXTNUM ((xfs_extnum_t)-1)
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
@@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
+ * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes
+ * cannot be used.
*/
#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3e57a56cf829..631e7c0e0a29 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -37,11 +37,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
-/* flags for direct write completions */
-#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
-#define XFS_DIO_FLAG_APPEND (1 << 1)
-#define XFS_DIO_FLAG_COW (1 << 2)
-
/*
* structure owned by writepages passed to individual writepage calls
*/
@@ -495,8 +490,8 @@ xfs_submit_ioend(
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
@@ -567,8 +562,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
@@ -777,7 +771,7 @@ xfs_map_cow(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_bmbt_irec imap;
- bool is_cow = false, need_alloc = false;
+ bool is_cow = false;
int error;
/*
@@ -795,7 +789,7 @@ xfs_map_cow(
* Else we need to check if there is a COW mapping at this offset.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+ is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!is_cow)
@@ -805,7 +799,7 @@ xfs_map_cow(
* And if the COW mapping has a delayed extent here we need to
* allocate real space for it now.
*/
- if (need_alloc) {
+ if (isnullstartblock(imap.br_startblock)) {
error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
&imap);
if (error)
@@ -1158,63 +1152,27 @@ xfs_vm_releasepage(
* block_invalidatepage() can send pages that are still marked dirty
* but otherwise have invalidated buffers.
*
- * We've historically freed buffers on the latter. Instead, quietly
- * filter out all dirty pages to avoid spurious buffer state warnings.
- * This can likely be removed once shrink_active_list() is fixed.
+ * We want to release the latter to avoid unnecessary buildup of the
+ * LRU, skip the former and warn if we've left any lingering
+ * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
+ * or unwritten buffers and warn if the page is not dirty. Otherwise
+ * try to release the buffers.
*/
- if (PageDirty(page))
- return 0;
-
xfs_count_page_state(page, &delalloc, &unwritten);
- if (WARN_ON_ONCE(delalloc))
+ if (delalloc) {
+ WARN_ON_ONCE(!PageDirty(page));
return 0;
- if (WARN_ON_ONCE(unwritten))
+ }
+ if (unwritten) {
+ WARN_ON_ONCE(!PageDirty(page));
return 0;
+ }
return try_to_free_buffers(page);
}
/*
- * When we map a DIO buffer, we may need to pass flags to
- * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
- *
- * Note that for DIO, an IO to the highest supported file block offset (i.e.
- * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
- * bit variable. Hence if we see this overflow, we have to assume that the IO is
- * extending the file size. We won't know for sure until IO completion is run
- * and the actual max write offset is communicated to the IO completion
- * routine.
- */
-static void
-xfs_map_direct(
- struct inode *inode,
- struct buffer_head *bh_result,
- struct xfs_bmbt_irec *imap,
- xfs_off_t offset,
- bool is_cow)
-{
- uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
- xfs_off_t size = bh_result->b_size;
-
- trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
- ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
- XFS_IO_OVERWRITE, imap);
-
- if (ISUNWRITTEN(imap)) {
- *flags |= XFS_DIO_FLAG_UNWRITTEN;
- set_buffer_defer_completion(bh_result);
- } else if (is_cow) {
- *flags |= XFS_DIO_FLAG_COW;
- set_buffer_defer_completion(bh_result);
- }
- if (offset + size > i_size_read(inode) || offset + size < 0) {
- *flags |= XFS_DIO_FLAG_APPEND;
- set_buffer_defer_completion(bh_result);
- }
-}
-
-/*
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
* is, so that we can avoid repeated get_blocks calls.
*
@@ -1254,52 +1212,12 @@ xfs_map_trim_size(
bh_result->b_size = mapping_size;
}
-/* Bounce unaligned directio writes to the page cache. */
static int
-xfs_bounce_unaligned_dio_write(
- struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_bmbt_irec irec;
- xfs_fileoff_t delta;
- bool shared;
- bool x;
- int error;
-
- irec = *imap;
- if (offset_fsb > irec.br_startoff) {
- delta = offset_fsb - irec.br_startoff;
- irec.br_blockcount -= delta;
- irec.br_startblock += delta;
- irec.br_startoff = offset_fsb;
- }
- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
- if (error)
- return error;
-
- /*
- * We're here because we're trying to do a directio write to a
- * region that isn't aligned to a filesystem block. If any part
- * of the extent is shared, fall back to buffered mode to handle
- * the RMW. This is done by returning -EREMCHG ("remote addr
- * changed"), which is caught further up the call stack.
- */
- if (shared) {
- trace_xfs_reflink_bounce_dio_write(ip, imap);
- return -EREMCHG;
- }
- return 0;
-}
-
-STATIC int
-__xfs_get_blocks(
+xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create,
- bool direct,
- bool dax_fault)
+ int create)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1310,11 +1228,8 @@ __xfs_get_blocks(
int nimaps = 1;
xfs_off_t offset;
ssize_t size;
- int new = 0;
- bool is_cow = false;
- bool need_alloc = false;
- BUG_ON(create && !direct);
+ BUG_ON(create);
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1323,7 +1238,7 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && offset >= i_size_read(inode))
+ if (offset >= i_size_read(inode))
return 0;
/*
@@ -1338,52 +1253,12 @@ __xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- if (create && direct && xfs_is_reflink_inode(ip))
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
- &need_alloc);
- if (!is_cow) {
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
- &imap, &nimaps, XFS_BMAPI_ENTIRE);
- /*
- * Truncate an overwrite extent if there's a pending CoW
- * reservation before the end of this extent. This
- * forces us to come back to get_blocks to take care of
- * the CoW.
- */
- if (create && direct && nimaps &&
- imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
- xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
- &imap);
- }
- ASSERT(!need_alloc);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
if (error)
goto out_unlock;
- /* for DAX, we convert unwritten extents directly */
- if (create &&
- (!nimaps ||
- (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK) ||
- (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
-
- trace_xfs_get_blocks_alloc(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_DELALLOC, &imap);
- } else if (nimaps) {
+ if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_OVERWRITE, &imap);
@@ -1393,12 +1268,6 @@ __xfs_get_blocks(
goto out_unlock;
}
- if (IS_DAX(inode) && create) {
- ASSERT(!ISUNWRITTEN(&imap));
- /* zeroing is not needed at a higher layer */
- new = 0;
- }
-
/* trim mapping down to size requested */
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
@@ -1408,50 +1277,14 @@ __xfs_get_blocks(
*/
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
- (create || !ISUNWRITTEN(&imap))) {
- if (create && direct && !is_cow) {
- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
- &imap);
- if (error)
- return error;
- }
-
+ !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
- if (ISUNWRITTEN(&imap))
- set_buffer_unwritten(bh_result);
- /* direct IO needs special help */
- if (create) {
- if (dax_fault)
- ASSERT(!ISUNWRITTEN(&imap));
- else
- xfs_map_direct(inode, bh_result, &imap, offset,
- is_cow);
- }
- }
/*
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-
- /*
- * If we previously allocated a block out beyond eof and we are now
- * coming back to use it then we will need to flag it as new even if it
- * has a disk address.
- *
- * With sub-block writes into unwritten extents we also need to mark
- * the buffer as new so that the unwritten parts of the buffer gets
- * correctly zeroed.
- */
- if (create &&
- ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
- (offset >= i_size_read(inode)) ||
- (new || ISUNWRITTEN(&imap))))
- set_buffer_new(bh_result);
-
- BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
-
return 0;
out_unlock:
@@ -1459,110 +1292,6 @@ out_unlock:
return error;
}
-int
-xfs_get_blocks(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
-}
-
-int
-xfs_get_blocks_direct(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
-}
-
-int
-xfs_get_blocks_dax_fault(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
-}
-
-/*
- * Complete a direct I/O write request.
- *
- * xfs_map_direct passes us some flags in the private data to tell us what to
- * do. If no flags are set, then the write IO is an overwrite wholly within
- * the existing allocated file size and so there is nothing for us to do.
- *
- * Note that in this case the completion can be called in interrupt context,
- * whereas if we have flags set we will always be called in task context
- * (i.e. from a workqueue).
- */
-int
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- uintptr_t flags = (uintptr_t)private;
- int error = 0;
-
- trace_xfs_end_io_direct_write(ip, offset, size);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- if (size <= 0)
- return size;
-
- /*
- * The flags tell us whether we are doing unwritten extent conversions
- * or an append transaction that updates the on-disk file size. These
- * cases are the only cases where we should *potentially* be needing
- * to update the VFS inode size.
- */
- if (flags == 0) {
- ASSERT(offset + size <= i_size_read(inode));
- return 0;
- }
-
- /*
- * We need to update the in-core inode size here so that we don't end up
- * with the on-disk inode size being outside the in-core inode size. We
- * have no other method of updating EOF for AIO, so always do it here
- * if necessary.
- *
- * We need to lock the test/set EOF update as we can be racing with
- * other IO completions here to update the EOF. Failing to serialise
- * here can result in EOF moving backwards and Bad Things Happen when
- * that occurs.
- */
- spin_lock(&ip->i_flags_lock);
- if (offset + size > i_size_read(inode))
- i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
-
- if (flags & XFS_DIO_FLAG_COW)
- error = xfs_reflink_end_cow(ip, offset, size);
- if (flags & XFS_DIO_FLAG_UNWRITTEN) {
- trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-
- error = xfs_iomap_write_unwritten(ip, offset, size);
- }
- if (flags & XFS_DIO_FLAG_APPEND) {
- trace_xfs_end_io_direct_write_append(ip, offset, size);
-
- error = xfs_setfilesize(ip, offset, size);
- }
-
- return error;
-}
-
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@@ -1583,7 +1312,6 @@ xfs_vm_bmap(
struct xfs_inode *ip = XFS_I(inode);
trace_xfs_vm_bmap(XFS_I(inode));
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1591,12 +1319,10 @@ xfs_vm_bmap(
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error..
*/
- if (xfs_is_reflink_inode(ip)) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (xfs_is_reflink_inode(ip))
return 0;
- }
+
filemap_write_and_wait(mapping);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b3c6634f9518..cc174ec6c2fd 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -55,15 +55,6 @@ struct xfs_ioend {
extern const struct address_space_operations xfs_address_space_operations;
-int xfs_get_blocks(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-
-int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private);
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index e3da5d448bcf..d14691aa02b4 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern {
*========================================================================*/
-/* Return 0 on success, or -errno; other state communicated via *context */
-typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
typedef struct xfs_attr_list_context {
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 25e76cd6c053..97c45b6eb91e 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
xfs_attr_sf_entry_t *sfe;
xfs_inode_t *dp;
int sbsize, nsbuf, count, i;
- int error;
ASSERT(context != NULL);
dp = context->dp;
@@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
(XFS_ISRESET_CURSOR(cursor) &&
(dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
- error = context->put_listent(context,
- sfe->flags,
- sfe->nameval,
- (int)sfe->namelen,
- (int)sfe->valuelen);
- if (error)
- return error;
+ context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen);
/*
* Either search callback finished early or
* didn't fit it all in the buffer after all.
@@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
cursor->hashval = sbp->hash;
cursor->offset = 0;
}
- error = context->put_listent(context,
- sbp->flags,
- sbp->name,
- sbp->namelen,
- sbp->valuelen);
- if (error) {
- kmem_free(sbuf);
- return error;
- }
+ context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen);
if (context->seen_enough)
break;
cursor->offset++;
@@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
for (;;) {
leaf = bp->b_addr;
- error = xfs_attr3_leaf_list_int(bp, context);
- if (error) {
- xfs_trans_brelse(NULL, bp);
- return error;
- }
+ xfs_attr3_leaf_list_int(bp, context);
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
if (context->seen_enough || leafhdr.forw == 0)
break;
@@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
/*
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
-int
+void
xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
@@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int(
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
struct xfs_attr_leaf_entry *entry;
- int retval;
int i;
struct xfs_mount *mp = context->dp->i_mount;
@@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int(
}
if (i == ichdr.count) {
trace_xfs_attr_list_notfound(context);
- return 0;
+ return;
}
} else {
entry = &entries[0];
@@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int(
/*
* We have found our place, start copying out the new attributes.
*/
- retval = 0;
for (; i < ichdr.count; entry++, i++) {
char *name;
int namelen, valuelen;
@@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int(
valuelen = be32_to_cpu(name_rmt->valuelen);
}
- retval = context->put_listent(context, entry->flags,
+ context->put_listent(context, entry->flags,
name, namelen, valuelen);
- if (retval)
- break;
if (context->seen_enough)
break;
cursor->offset++;
}
trace_xfs_attr_list_leaf_end(context);
- return retval;
+ return;
}
/*
@@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
if (error)
return error;
- error = xfs_attr3_leaf_list_int(bp, context);
+ xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(NULL, bp);
- return error;
+ return 0;
}
int
@@ -513,7 +498,7 @@ xfs_attr_list_int(
* Take care to check values and protect against them changing later,
* we may be reading them directly out of a user buffer.
*/
-STATIC int
+STATIC void
xfs_attr_put_listent(
xfs_attr_list_context_t *context,
int flags,
@@ -536,10 +521,10 @@ xfs_attr_put_listent(
*/
if (((context->flags & ATTR_SECURE) == 0) !=
((flags & XFS_ATTR_SECURE) == 0))
- return 0;
+ return;
if (((context->flags & ATTR_ROOT) == 0) !=
((flags & XFS_ATTR_ROOT) == 0))
- return 0;
+ return;
arraytop = sizeof(*alist) +
context->count * sizeof(alist->al_offset[0]);
@@ -548,7 +533,7 @@ xfs_attr_put_listent(
trace_xfs_attr_list_full(context);
alist->al_more = 1;
context->seen_enough = 1;
- return 0;
+ return;
}
aep = (attrlist_ent_t *)&context->alist[context->firstu];
@@ -558,7 +543,7 @@ xfs_attr_put_listent(
alist->al_offset[context->count++] = context->firstu;
alist->al_count = context->count;
trace_xfs_attr_list_add(context);
- return 0;
+ return;
}
/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 552465e011ec..c1417919ab0a 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -359,9 +359,7 @@ xfs_bmap_count_blocks(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- xfs_bmap_count_leaves(ifp, 0,
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count);
+ xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count);
return 0;
}
@@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole(
ifp = XFS_IFORK_PTR(ip, whichfork);
if (!moretocome &&
xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
- (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ (lastx == xfs_iext_count(ifp) - 1))
out->bmv_oflags |= BMV_OF_LAST;
}
@@ -530,7 +528,6 @@ xfs_getbmap(
xfs_bmbt_irec_t *map; /* buffer for user's data */
xfs_mount_t *mp; /* file system mount point */
int nex; /* # of user extents can do */
- int nexleft; /* # of user extents left */
int subnex; /* # of bmapi's can do */
int nmap; /* number of map entries */
struct getbmapx *out; /* output structure */
@@ -688,10 +685,8 @@ xfs_getbmap(
goto out_free_map;
}
- nexleft = nex;
-
do {
- nmap = (nexleft > subnex) ? subnex : nexleft;
+ nmap = (nex> subnex) ? subnex : nex;
error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
XFS_BB_TO_FSB(mp, bmv->bmv_length),
map, &nmap, bmapi_flags);
@@ -699,8 +694,8 @@ xfs_getbmap(
goto out_free_map;
ASSERT(nmap <= subnex);
- for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
- cur_ext < bmv->bmv_count; i++) {
+ for (i = 0; i < nmap && bmv->bmv_length &&
+ cur_ext < bmv->bmv_count - 1; i++) {
out[cur_ext].bmv_oflags = 0;
if (map[i].br_state == XFS_EXT_UNWRITTEN)
out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -762,16 +757,27 @@ xfs_getbmap(
continue;
}
+ /*
+ * In order to report shared extents accurately,
+ * we report each distinct shared/unshared part
+ * of a single bmbt record using multiple bmap
+ * extents. To make that happen, we iterate the
+ * same map array item multiple times, each
+ * time trimming out the subextent that we just
+ * reported.
+ *
+ * Because of this, we must check the out array
+ * index (cur_ext) directly against bmv_count-1
+ * to avoid overflows.
+ */
if (inject_map.br_startblock != NULLFSBLOCK) {
map[i] = inject_map;
i--;
- } else
- nexleft--;
+ }
bmv->bmv_entries++;
cur_ext++;
}
- } while (nmap && nexleft && bmv->bmv_length &&
- cur_ext < bmv->bmv_count);
+ } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
out_free_map:
kmem_free(map);
@@ -1792,6 +1798,7 @@ xfs_swap_extent_forks(
struct xfs_ifork tempifp, *ifp, *tifp;
int aforkblks = 0;
int taforkblks = 0;
+ xfs_extnum_t nextents;
__uint64_t tmp;
int error;
@@ -1877,14 +1884,13 @@ xfs_swap_extent_forks(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
+ /*
+ * If the extents fit in the inode, fix the pointer. Otherwise
+ * it's already NULL or pointing to the extent.
*/
- if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- ifp->if_u1.if_extents =
- ifp->if_u2.if_inline_ext;
- }
+ nextents = xfs_iext_count(&ip->i_df);
+ if (nextents <= XFS_INLINE_EXTS)
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1896,14 +1902,13 @@ xfs_swap_extent_forks(
switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
+ /*
+ * If the extents fit in the inode, fix the pointer. Otherwise
+ * it's already NULL or pointing to the extent.
*/
- if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- tifp->if_u1.if_extents =
- tifp->if_u2.if_inline_ext;
- }
+ nextents = xfs_iext_count(&tip->i_df);
+ if (nextents <= XFS_INLINE_EXTS)
+ tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext;
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1938,8 +1943,8 @@ xfs_swap_extents(
* page cache safely. Once we have done this we can take the ilocks and
* do the rest of the checks.
*/
- lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
+ lock_flags = XFS_MMAPLOCK_EXCL;
xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
/* Verify that both files have the same format */
@@ -2079,15 +2084,13 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
+out_unlock:
xfs_iunlock(ip, lock_flags);
xfs_iunlock(tip, lock_flags);
+ unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
-
-out_unlock:
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
- return error;
+ goto out_unlock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5b9bffe3520..ac3b4db519df 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -219,7 +219,6 @@ _xfs_buf_alloc(
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
- RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
@@ -423,6 +422,7 @@ retry:
out_free_pages:
for (i = 0; i < bp->b_page_count; i++)
__free_page(bp->b_pages[i]);
+ bp->b_flags &= ~_XBF_PAGES;
return error;
}
@@ -473,6 +473,62 @@ _xfs_buf_map_pages(
/*
* Finding and Reading Buffers
*/
+static int
+_xfs_buf_obj_cmp(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct xfs_buf_map *map = arg->key;
+ const struct xfs_buf *bp = obj;
+
+ /*
+ * The key hashing in the lookup path depends on the key being the
+ * first element of the compare_arg, make sure to assert this.
+ */
+ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+ if (bp->b_bn != map->bm_bn)
+ return 1;
+
+ if (unlikely(bp->b_length != map->bm_len)) {
+ /*
+ * found a block number match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching for an exact match.
+ */
+ ASSERT(bp->b_flags & XBF_STALE);
+ return 1;
+ }
+ return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+ .min_size = 32, /* empty AGs have minimal footprint */
+ .nelem_hint = 16,
+ .key_len = sizeof(xfs_daddr_t),
+ .key_offset = offsetof(struct xfs_buf, b_bn),
+ .head_offset = offsetof(struct xfs_buf, b_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+ struct xfs_perag *pag)
+{
+ spin_lock_init(&pag->pag_buf_lock);
+ return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+ struct xfs_perag *pag)
+{
+ rhashtable_destroy(&pag->pag_buf_hash);
+}
/*
* Look up, and creates if absent, a lockable buffer for
@@ -488,27 +544,24 @@ _xfs_buf_find(
xfs_buf_t *new_bp)
{
struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent;
xfs_buf_t *bp;
- xfs_daddr_t blkno = map[0].bm_bn;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int numblks = 0;
int i;
for (i = 0; i < nmaps; i++)
- numblks += map[i].bm_len;
+ cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (blkno < 0 || blkno >= eofs) {
+ if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
/*
* XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
@@ -516,53 +569,29 @@ _xfs_buf_find(
*/
xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
- __func__, blkno, eofs);
+ __func__, cmap.bm_bn, eofs);
WARN_ON(1);
return NULL;
}
- /* get tree root */
pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, blkno));
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
- /* walk tree */
spin_lock(&pag->pag_buf_lock);
- rbp = &pag->pag_buf_tree.rb_node;
- parent = NULL;
- bp = NULL;
- while (*rbp) {
- parent = *rbp;
- bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
- if (blkno < bp->b_bn)
- rbp = &(*rbp)->rb_left;
- else if (blkno > bp->b_bn)
- rbp = &(*rbp)->rb_right;
- else {
- /*
- * found a block number match. If the range doesn't
- * match, the only way this is allowed is if the buffer
- * in the cache is stale and the transaction that made
- * it stale has not yet committed. i.e. we are
- * reallocating a busy extent. Skip this buffer and
- * continue searching to the right for an exact match.
- */
- if (bp->b_length != numblks) {
- ASSERT(bp->b_flags & XBF_STALE);
- rbp = &(*rbp)->rb_right;
- continue;
- }
- atomic_inc(&bp->b_hold);
- goto found;
- }
+ bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+ xfs_buf_hash_params);
+ if (bp) {
+ atomic_inc(&bp->b_hold);
+ goto found;
}
/* No match found */
if (new_bp) {
- rb_link_node(&new_bp->b_rbnode, parent, rbp);
- rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
/* the buffer keeps the perag reference until it is freed */
new_bp->b_pag = pag;
+ rhashtable_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
} else {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@@ -930,7 +959,6 @@ xfs_buf_rele(
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
- ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
@@ -938,8 +966,6 @@ xfs_buf_rele(
return;
}
- ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
ASSERT(atomic_read(&bp->b_hold) > 0);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@@ -983,7 +1009,8 @@ xfs_buf_rele(
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
freebuf = true;
@@ -1304,7 +1331,7 @@ _xfs_buf_ioapply(
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
if (bp->b_flags & XBF_SYNCIO)
- op_flags = WRITE_SYNC;
+ op_flags = REQ_SYNC;
if (bp->b_flags & XBF_FUA)
op_flags |= REQ_FUA;
if (bp->b_flags & XBF_FLUSH)
@@ -1711,8 +1738,7 @@ xfs_free_buftarg(
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- if (mp->m_flags & XFS_MOUNT_BARRIER)
- xfs_blkdev_issue_flush(btp);
+ xfs_blkdev_issue_flush(btp);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1c2e52b2d926..8a9d3a9599f0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
+ { XBF_NO_IOACCT, "NO_IOACCT" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
@@ -150,7 +151,7 @@ typedef struct xfs_buf {
* which is the only bit that is touched if we hit the semaphore
* fast-path on locking.
*/
- struct rb_node b_rbnode; /* rbtree node */
+ struct rhash_head b_rhash_head; /* pag buffer hash node */
xfs_daddr_t b_bn; /* block number of buffer */
int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 29816981b50a..003a99b83bd8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -677,7 +677,6 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -686,7 +685,6 @@ xfs_readdir(
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return rval;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7a30b8f11db7..9d06cc30e875 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -710,6 +710,10 @@ xfs_dq_get_next_id(
/* Simple advance */
next_id = *id + 1;
+ /* If we'd wrap past the max ID, stop */
+ if (next_id < *id)
+ return -ENOENT;
+
/* If new ID is within the current chunk, advancing it sufficed */
if (next_id % mp->m_quotainfo->qi_dqperchunk) {
*id = next_id;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6e4f7f900fea..bbb9eb6811b2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -48,40 +48,6 @@
static const struct vm_operations_struct xfs_file_vm_ops;
/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
- struct xfs_inode *ip,
- int type)
-{
- if (type & XFS_IOLOCK_EXCL)
- inode_lock(VFS_I(ip));
- xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
- struct xfs_inode *ip,
- int type)
-{
- xfs_iunlock(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
- struct xfs_inode *ip,
- int type)
-{
- xfs_ilock_demote(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
-/*
* Clear the specified ranges to zero through either the pagecache or DAX.
* Holes and unwritten extents will be left as-is as they already are zeroed.
*/
@@ -183,19 +149,16 @@ xfs_file_fsync(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- if (mp->m_flags & XFS_MOUNT_BARRIER) {
- /*
- * If we have an RT and/or log subvolume we need to make sure
- * to flush the write cache the device used for file data
- * first. This is to ensure newly written file data make
- * it to disk before logging the new inode size in case of
- * an extending write.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
- else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
- }
+ /*
+ * If we have an RT and/or log subvolume we need to make sure to flush
+ * the write cache the device used for file data first. This is to
+ * ensure newly written file data make it to disk before logging the new
+ * inode size in case of an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
/*
* All metadata updates are logged, which means that we just have to
@@ -230,10 +193,8 @@ xfs_file_fsync(
* an already allocated file and thus do not have any metadata to
* commit.
*/
- if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
- mp->m_logdev_targp == mp->m_ddev_targp &&
- !XFS_IS_REALTIME_INODE(ip) &&
- !log_flushed)
+ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+ mp->m_logdev_targp == mp->m_ddev_targp)
xfs_blkdev_issue_flush(mp->m_ddev_targp);
return error;
@@ -244,62 +205,21 @@ xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- loff_t isize = i_size_read(inode);
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
size_t count = iov_iter_count(to);
- loff_t end = iocb->ki_pos + count - 1;
- struct iov_iter data;
- struct xfs_buftarg *target;
- ssize_t ret = 0;
+ ssize_t ret;
trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
if (!count)
return 0; /* skip atime */
- if (XFS_IS_REALTIME_INODE(ip))
- target = ip->i_mount->m_rtdev_targp;
- else
- target = ip->i_mount->m_ddev_targp;
-
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
- if (iocb->ki_pos == isize)
- return 0;
- return -EINVAL;
- }
-
file_accessed(iocb->ki_filp);
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
- if (ret)
- goto out_unlock;
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- /*
- * Invalidate whole pages. This can return an error if we fail
- * to invalidate a page, but this should never happen on XFS.
- * Warn if it does fail.
- */
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
-
- data = *to;
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, NULL, NULL, 0);
- if (ret >= 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(to, ret);
- }
-
-out_unlock:
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -317,9 +237,9 @@ xfs_file_dax_read(
if (!count)
return 0; /* skip atime */
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
return ret;
@@ -335,9 +255,9 @@ xfs_file_buffered_aio_read(
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
ret = generic_file_read_iter(iocb, to);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -418,15 +338,18 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, true);
+ error = xfs_break_layouts(inode, iolock);
if (error)
return error;
- /* For changing security info in file_remove_privs() we need i_mutex */
+ /*
+ * For changing security info in file_remove_privs() we need i_rwsem
+ * exclusively.
+ */
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
goto restart;
}
/*
@@ -451,9 +374,9 @@ restart:
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
iov_iter_reexpand(from, count);
}
/*
@@ -496,6 +419,58 @@ restart:
return 0;
}
+static int
+xfs_dio_write_end_io(
+ struct kiocb *iocb,
+ ssize_t size,
+ unsigned flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ loff_t offset = iocb->ki_pos;
+ bool update_size = false;
+ int error = 0;
+
+ trace_xfs_end_io_direct_write(ip, offset, size);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ if (size <= 0)
+ return size;
+
+ /*
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (offset + size > i_size_read(inode)) {
+ i_size_write(inode, offset + size);
+ update_size = true;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (flags & IOMAP_DIO_COW) {
+ error = xfs_reflink_end_cow(ip, offset, size);
+ if (error)
+ return error;
+ }
+
+ if (flags & IOMAP_DIO_UNWRITTEN)
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ else if (update_size)
+ error = xfs_setfilesize(ip, offset, size);
+
+ return error;
+}
+
/*
* xfs_file_dio_aio_write - handle direct IO writes
*
@@ -535,9 +510,7 @@ xfs_file_dio_aio_write(
int unaligned_io = 0;
int iolock;
size_t count = iov_iter_count(from);
- loff_t end;
- struct iov_iter data;
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
@@ -559,29 +532,12 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
- end = iocb->ki_pos + count - 1;
-
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
- if (ret)
- goto out;
-
- /*
- * Invalidate whole pages. This can return an error if we fail
- * to invalidate a page, but this should never happen on XFS.
- * Warn if it does fail.
- */
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
/*
* If we are doing unaligned IO, wait for all other IO to drain,
@@ -591,7 +547,7 @@ xfs_file_dio_aio_write(
if (unaligned_io)
inode_dio_wait(inode);
else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
@@ -604,24 +560,9 @@ xfs_file_dio_aio_write(
goto out;
}
- data = *from;
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, xfs_end_io_direct_write,
- NULL, DIO_ASYNC_EXTEND);
-
- /* see generic_file_direct_write() for why this is necessary */
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- }
-
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(from, ret);
- }
+ ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
/*
* No fallback to buffered IO on errors for XFS, direct IO will either
@@ -643,7 +584,7 @@ xfs_file_dax_write(
size_t count;
loff_t pos;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
@@ -652,15 +593,13 @@ xfs_file_dax_write(
count = iov_iter_count(from);
trace_xfs_file_dax_write(ip, count, pos);
-
- ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
-
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return error ? error : ret;
}
@@ -677,7 +616,7 @@ xfs_file_buffered_aio_write(
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
@@ -721,7 +660,7 @@ write_retry:
current->backing_dev_info = NULL;
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return ret;
}
@@ -797,7 +736,7 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, false);
+ error = xfs_break_layouts(inode, &iolock);
if (error)
goto out_unlock;
@@ -909,24 +848,6 @@ out_unlock:
return error;
}
-STATIC ssize_t
-xfs_file_copy_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- size_t len,
- unsigned int flags)
-{
- int error;
-
- error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
- if (error)
- return error;
- return len;
-}
-
STATIC int
xfs_file_clone_range(
struct file *file_in,
@@ -939,7 +860,6 @@ xfs_file_clone_range(
len, false);
}
-#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
STATIC ssize_t
xfs_file_dedupe_range(
struct file *src_file,
@@ -950,14 +870,6 @@ xfs_file_dedupe_range(
{
int error;
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > XFS_MAX_DEDUPE_LEN)
- len = XFS_MAX_DEDUPE_LEN;
-
error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
len, true);
if (error)
@@ -1474,7 +1386,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
@@ -1501,15 +1413,9 @@ xfs_filemap_fault(
return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode)) {
- /*
- * we do not want to trigger unwritten extent conversion on read
- * faults - that is unnecessary overhead and would also require
- * changes to xfs_get_blocks_direct() to map unwritten extent
- * ioend for conversion on read-only mappings.
- */
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
- } else
+ if (IS_DAX(inode))
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+ else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1545,7 +1451,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+ ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
@@ -1625,7 +1531,6 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
- .copy_file_range = xfs_file_copy_range,
.clone_file_range = xfs_file_clone_range,
.dedupe_file_range = xfs_file_dedupe_range,
};
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 93d12fa2670d..242e8091296d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -631,6 +631,20 @@ xfs_growfs_data_private(
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ /*
+ * If we expanded the last AG, free the per-AG reservation
+ * so we can reinitialize it with the new size.
+ */
+ if (new) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_ag_resv_free(pag);
+ xfs_perag_put(pag);
+ if (error)
+ goto out;
+ }
+
/* Reserve AG metadata blocks. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f295049db681..70ca4f608321 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -70,8 +70,6 @@ xfs_inode_alloc(
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
/* initialise the xfs inode */
ip->i_ino = ino;
ip->i_mount = mp;
@@ -123,7 +121,6 @@ __xfs_inode_free(
{
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -133,6 +130,8 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
+ ASSERT(!xfs_isiflocked(ip));
+
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -393,8 +392,8 @@ xfs_iget_cache_hit(
xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ init_rwsem(&inode->i_rwsem);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
@@ -981,6 +980,7 @@ restart:
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
+ /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip, false);
goto reclaim;
}
@@ -989,10 +989,10 @@ restart:
goto out_ifunlock;
xfs_iunpin_wait(ip);
}
- if (xfs_iflags_test(ip, XFS_ISTALE))
- goto reclaim;
- if (xfs_inode_clean(ip))
+ if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
goto reclaim;
+ }
/*
* Never flush out dirty data during non-blocking reclaim, as it would
@@ -1030,25 +1030,24 @@ restart:
xfs_buf_relse(bp);
}
- xfs_iflock(ip);
reclaim:
+ ASSERT(!xfs_isiflocked(ip));
+
/*
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
- * We do this as early as possible under the ILOCK and flush lock so
- * that xfs_iflush_cluster() can be guaranteed to detect races with us
- * here. By doing this, we guarantee that once xfs_iflush_cluster has
- * locked both the XFS_ILOCK and the flush lock that it will see either
- * a valid, flushable inode that will serialise correctly against the
- * locks below, or it will see a clean (and invalid) inode that it can
- * skip.
+ * We do this as early as possible under the ILOCK so that
+ * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+ * By doing this, we guarantee that once xfs_iflush_cluster has locked
+ * XFS_ILOCK that it will see either a valid, flushable inode that will
+ * serialise correctly, or it will see a clean (and invalid) inode that
+ * it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
- xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks(
struct xfs_eofblocks *eofb = args;
bool need_iolock = true;
int match;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
- if (!xfs_reflink_has_real_cow_blocks(ip)) {
+ /*
+ * Just clear the tag if we have an empty cow fork or none at all. It's
+ * possible the inode was fully unshared since it was originally tagged.
+ */
+ if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) {
trace_xfs_inode_free_cowblocks_invalid(ip);
xfs_inode_clear_cowblocks_tag(ip);
return 0;
@@ -1593,7 +1597,8 @@ xfs_inode_free_cowblocks(
* If the mapping is dirty or under writeback we cannot touch the
* CoW fork. Leave it alone if we're in the midst of a directio.
*/
- if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+ if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
atomic_read(&VFS_I(ip)->i_dio_count))
return 0;
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d45ca72af6fb..865ad1373e5e 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -133,7 +133,7 @@ xfs_icreate_item_committing(
/*
* This is the ops vector shared by all buf log items.
*/
-static struct xfs_item_ops xfs_icreate_item_ops = {
+static const struct xfs_item_ops xfs_icreate_item_ops = {
.iop_size = xfs_icreate_item_size,
.iop_format = xfs_icreate_item_format,
.iop_pin = xfs_icreate_item_pin,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4e560e6a12c1..de32f0fe47c8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared(
}
/*
- * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
- * the i_lock. This routine allows various combinations of the locks to be
- * obtained.
+ * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
+ * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
* the mmap lock second and the ilock last in order to prevent deadlock.
*
* Basic locking order:
*
- * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
*
* mmap_sem locking order:
*
- * i_iolock -> page lock -> mmap_sem
+ * i_rwsem -> page lock -> mmap_sem
* mmap_sem -> i_mmap_lock -> page_lock
*
* The difference in mmap_sem locking order mean that we cannot hold the
* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
* fault in pages during copy in/out (for buffered IO) or require the mmap_sem
* in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
* page faults already hold the mmap_sem.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
* taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
@@ -191,10 +191,13 @@ xfs_ilock(
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_iolock))
+ if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
goto out;
} else if (lock_flags & XFS_IOLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_iolock))
+ if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
goto out;
}
@@ -271,9 +274,9 @@ out_undo_mmaplock:
mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
+ up_write(&VFS_I(ip)->i_rwsem);
else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
+ up_read(&VFS_I(ip)->i_rwsem);
out:
return 0;
}
@@ -310,9 +313,9 @@ xfs_iunlock(
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
+ up_write(&VFS_I(ip)->i_rwsem);
else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
+ up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrunlock_excl(&ip->i_mmaplock);
@@ -345,7 +348,7 @@ xfs_ilock_demote(
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrdemote(&ip->i_mmaplock);
if (lock_flags & XFS_IOLOCK_EXCL)
- mrdemote(&ip->i_iolock);
+ downgrade_write(&VFS_I(ip)->i_rwsem);
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
@@ -370,8 +373,9 @@ xfs_isilocked(
if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !!ip->i_iolock.mr_writer;
- return rwsem_is_locked(&ip->i_iolock.mr_lock);
+ return !debug_locks ||
+ lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
+ return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
}
ASSERT(0);
@@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
- ASSERT(xfs_lockdep_subclass_ok(subclass +
- XFS_IOLOCK_PARENT_VAL));
class += subclass << XFS_IOLOCK_SHIFT;
- if (lock_mode & XFS_IOLOCK_PARENT)
- class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
}
if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
@@ -477,8 +477,6 @@ xfs_lock_inodes(
XFS_ILOCK_EXCL));
ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
XFS_ILOCK_SHARED)));
- ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
- inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
@@ -581,10 +579,8 @@ xfs_lock_two_inodes(
int attempts = 0;
xfs_log_item_t *lp;
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
- ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
- ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+ ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
+ if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
@@ -715,7 +711,6 @@ xfs_lookup(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
if (error)
goto out_unlock;
@@ -724,14 +719,12 @@ xfs_lookup(
if (error)
goto out_free_name;
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return 0;
out_free_name:
if (ci_name)
kmem_free(ci_name->name);
out_unlock:
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
*ipp = NULL;
return error;
}
@@ -1215,8 +1208,7 @@ xfs_create(
if (error)
goto out_release_inode;
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
- XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
xfs_defer_init(&dfops, &first_block);
@@ -1252,7 +1244,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1325,7 +1317,7 @@ xfs_create(
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
@@ -1466,11 +1458,10 @@ xfs_link(
if (error)
goto std_return;
- xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow hard link
@@ -1801,22 +1792,23 @@ xfs_inactive_ifree(
int error;
/*
- * The ifree transaction might need to allocate blocks for record
- * insertion to the finobt. We don't want to fail here at ENOSPC, so
- * allow ifree to dip into the reserved block pool if necessary.
- *
- * Freeing large sets of inodes generally means freeing inode chunks,
- * directory and file data blocks, so this should be relatively safe.
- * Only under severe circumstances should it be possible to free enough
- * inodes to exhaust the reserve block pool via finobt expansion while
- * at the same time not creating free space in the filesystem.
+ * We try to use a per-AG reservation for any block needed by the finobt
+ * tree, but as the finobt feature predates the per-AG reservation
+ * support a degraded file system might not have enough space for the
+ * reservation at mount time. In that case try to dip into the reserved
+ * pool and pray.
*
* Send a warning if the reservation does happen to fail, as the inode
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
- XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (unlikely(mp->m_inotbt_nores)) {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+ &tp);
+ } else {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+ }
if (error) {
if (error == -ENOSPC) {
xfs_warn_ratelimited(mp,
@@ -2041,7 +2033,6 @@ xfs_iunlink(
agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
return 0;
@@ -2133,7 +2124,6 @@ xfs_iunlink_remove(
agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
} else {
@@ -2579,10 +2569,9 @@ xfs_remove(
goto std_return;
}
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
@@ -2963,12 +2952,6 @@ xfs_rename(
* whether the target directory is the same as the source
* directory, we can lock from 2 to 4 inodes.
*/
- if (!new_parent)
- xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
- else
- xfs_lock_two_inodes(src_dp, target_dp,
- XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
@@ -2976,9 +2959,9 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f14c1de2549d..10dcf27b4c85 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,7 +56,6 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
- mrlock_t i_iolock; /* inode IO lock */
mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
* Synchronize processes attempting to flush the in-core inode back to disk.
*/
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
extern void __xfs_iflock(struct xfs_inode *ip);
static inline int xfs_iflock_nowait(struct xfs_inode *ip)
@@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip)
static inline void xfs_ifunlock(struct xfs_inode *ip)
{
+ ASSERT(xfs_isiflocked(ip));
xfs_iflags_clear(ip, XFS_IFLOCK);
smp_mb();
wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
}
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
@@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* IOLOCK values
*
* 0-3 subclass value
- * 4-7 PARENT subclass values
+ * 4-7 unused
*
* MMAPLOCK values
*
@@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
*
*/
#define XFS_IOLOCK_SHIFT 16
-#define XFS_IOLOCK_PARENT_VAL 4
-#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_MAX_SUBCLASS 3
#define XFS_IOLOCK_DEP_MASK 0x000f0000
-#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9610e9c00952..d90e7811ccdd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork(
struct xfs_bmbt_rec *p;
ASSERT(ip->i_df.if_u1.if_extents != NULL);
- ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
+ ASSERT(xfs_iext_count(&ip->i_df) > 0);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
@@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork(
ip->i_afp->if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+ ASSERT(xfs_iext_count(ip->i_afp) ==
ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c245bed3249b..c67cfb451fd3 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -287,7 +287,7 @@ xfs_readlink_by_handle(
return PTR_ERR(dentry);
/* Restrict this handle operation to symlinks only. */
- if (!d_inode(dentry)->i_op->readlink) {
+ if (!d_is_symlink(dentry)) {
error = -EINVAL;
goto out_dput;
}
@@ -297,7 +297,7 @@ xfs_readlink_by_handle(
goto out_dput;
}
- error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen);
+ error = vfs_readlink(dentry, hreq->ohandle, olen);
out_dput:
dput(dentry);
@@ -639,7 +639,7 @@ xfs_ioc_space(
return error;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, false);
+ error = xfs_break_layouts(inode, &iolock);
if (error)
goto out_unlock;
@@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr(
if (attr) {
if (ip->i_afp) {
if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = ip->i_afp->if_bytes /
- sizeof(xfs_bmbt_rec_t);
+ fa.fsx_nextents = xfs_iext_count(ip->i_afp);
else
fa.fsx_nextents = ip->i_d.di_anextents;
} else
fa.fsx_nextents = 0;
} else {
if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = ip->i_df.if_bytes /
- sizeof(xfs_bmbt_rec_t);
+ fa.fsx_nextents = xfs_iext_count(&ip->i_df);
else
fa.fsx_nextents = ip->i_d.di_nextents;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 321f57721b92..7c49938c5aed 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -19,7 +19,7 @@
#include <linux/ioctl.h>
#include <linux/mount.h>
#include <linux/slab.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 436e109bb01e..1aa3abd67b36 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -395,11 +395,12 @@ xfs_iomap_prealloc_size(
struct xfs_inode *ip,
loff_t offset,
loff_t count,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *prev)
+ xfs_extnum_t idx)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ struct xfs_bmbt_irec prev;
int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
@@ -419,8 +420,8 @@ xfs_iomap_prealloc_size(
*/
if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- idx == 0 ||
- prev->br_startoff + prev->br_blockcount < offset_fsb)
+ !xfs_iext_get_extent(ifp, idx - 1, &prev) ||
+ prev.br_startoff + prev.br_blockcount < offset_fsb)
return mp->m_writeio_blocks;
/*
@@ -439,8 +440,8 @@ xfs_iomap_prealloc_size(
* always extends to MAXEXTLEN rather than falling short due to things
* like stripe unit/width alignment of real extents.
*/
- if (prev->br_blockcount <= (MAXEXTLEN >> 1))
- alloc_blocks = prev->br_blockcount << 1;
+ if (prev.br_blockcount <= (MAXEXTLEN >> 1))
+ alloc_blocks = prev.br_blockcount << 1;
else
alloc_blocks = XFS_B_TO_FSB(mp, offset);
if (!alloc_blocks)
@@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- xfs_fileoff_t end_fsb, orig_end_fsb;
+ xfs_fileoff_t end_fsb;
int error = 0, eof = 0;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec prev;
xfs_extnum_t idx;
+ xfs_fsblock_t prealloc_blocks = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
- xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
- &got, &prev);
+ eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
if (!eof && got.br_startoff <= offset_fsb) {
if (xfs_is_reflink_inode(ip)) {
bool shared;
@@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay(
* the lower level functions are updated.
*/
count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = orig_end_fsb =
- min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
if (eof) {
- xfs_fsblock_t prealloc_blocks;
-
- prealloc_blocks =
- xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
+ xfs_fileoff_t p_end_fsb;
end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
- end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
- prealloc_blocks;
+ p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
align = xfs_eof_alignment(ip, 0);
if (align)
- end_fsb = roundup_64(end_fsb, align);
+ p_end_fsb = roundup_64(p_end_fsb, align);
- end_fsb = min(end_fsb, maxbytes_fsb);
- ASSERT(end_fsb > offset_fsb);
+ p_end_fsb = min(p_end_fsb, maxbytes_fsb);
+ ASSERT(p_end_fsb > offset_fsb);
+ prealloc_blocks = p_end_fsb - end_fsb;
}
}
retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, &got,
- &prev, &idx, eof);
+ end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
switch (error) {
case 0:
break;
@@ -631,8 +628,8 @@ retry:
case -EDQUOT:
/* retry without any preallocation */
trace_xfs_delalloc_enospc(ip, offset, count);
- if (end_fsb != orig_end_fsb) {
- end_fsb = orig_end_fsb;
+ if (prealloc_blocks) {
+ prealloc_blocks = 0;
goto retry;
}
/*FALLTHRU*/
@@ -640,13 +637,6 @@ retry:
goto out_unlock;
}
- /*
- * Tag the inode as speculatively preallocated so we can reclaim this
- * space on demand, if necessary.
- */
- if (end_fsb != orig_end_fsb)
- xfs_inode_set_eofblocks_tag(ip);
-
trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
done:
if (isnullstartblock(got.br_startblock))
@@ -691,7 +681,7 @@ xfs_iomap_write_allocate(
xfs_trans_t *tp;
int nimaps;
int error = 0;
- int flags = 0;
+ int flags = XFS_BMAPI_DELALLOC;
int nres;
if (whichfork == XFS_COW_FORK)
@@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode,
(IS_DAX(inode) && ISUNWRITTEN(imap));
}
+static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+{
+ /*
+ * COW writes will allocate delalloc space, so we need to make sure
+ * to take the lock exclusively here.
+ */
+ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ return true;
+ if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
+ return true;
+ return false;
+}
+
static int
xfs_file_iomap_begin(
struct inode *inode,
@@ -979,18 +982,14 @@ xfs_file_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
- !xfs_get_extsz_hint(ip)) {
+ if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+ !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
return xfs_file_iomap_begin_delay(inode, offset, length, flags,
iomap);
}
- /*
- * COW writes will allocate delalloc space, so we need to make sure
- * to take the lock exclusively here.
- */
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ if (need_excl_ilock(ip, flags)) {
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
} else {
@@ -1003,17 +1002,41 @@ xfs_file_iomap_begin(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ if (xfs_is_reflink_inode(ip) &&
+ (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
+ shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
+ if (shared) {
+ xfs_iunlock(ip, lockmode);
+ goto alloc_done;
+ }
+ ASSERT(!isnullstartblock(imap.br_startblock));
+ }
+
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
if (error)
goto out_unlock;
- if (flags & IOMAP_REPORT) {
+ if ((flags & IOMAP_REPORT) ||
+ (xfs_is_reflink_inode(ip) &&
+ (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
/* Trim the mapping to the nearest shared extent boundary. */
error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
&trimmed);
if (error)
goto out_unlock;
+
+ /*
+ * We're here because we're trying to do a directio write to a
+ * region that isn't aligned to a filesystem block. If the
+ * extent is shared, fall back to buffered mode to handle the
+ * RMW.
+ */
+ if (!(flags & IOMAP_REPORT) && shared) {
+ trace_xfs_reflink_bounce_dio_write(ip, &imap);
+ error = -EREMCHG;
+ goto out_unlock;
+ }
}
if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
@@ -1048,6 +1071,7 @@ xfs_file_iomap_begin(
if (error)
return error;
+alloc_done:
iomap->flags = IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
} else {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 405a65cd9d6b..22c16155f1b4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -98,12 +98,27 @@ xfs_init_security(
static void
xfs_dentry_to_name(
struct xfs_name *namep,
+ struct dentry *dentry)
+{
+ namep->name = dentry->d_name.name;
+ namep->len = dentry->d_name.len;
+ namep->type = XFS_DIR3_FT_UNKNOWN;
+}
+
+static int
+xfs_dentry_mode_to_name(
+ struct xfs_name *namep,
struct dentry *dentry,
int mode)
{
namep->name = dentry->d_name.name;
namep->len = dentry->d_name.len;
- namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
+ namep->type = xfs_mode_to_ftype(mode);
+
+ if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN))
+ return -EFSCORRUPTED;
+
+ return 0;
}
STATIC void
@@ -119,7 +134,7 @@ xfs_cleanup_inode(
* xfs_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
- xfs_dentry_to_name(&teardown, dentry, 0);
+ xfs_dentry_to_name(&teardown, dentry);
xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
}
@@ -154,8 +169,12 @@ xfs_generic_create(
if (error)
return error;
+ /* Verify mode is valid also for tmpfile case */
+ error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ if (unlikely(error))
+ goto out_free_acl;
+
if (!tmpfile) {
- xfs_dentry_to_name(&name, dentry, mode);
error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
} else {
error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
@@ -248,7 +267,7 @@ xfs_vn_lookup(
if (dentry->d_name.len >= MAXNAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- xfs_dentry_to_name(&name, dentry, 0);
+ xfs_dentry_to_name(&name, dentry);
error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
if (unlikely(error)) {
if (unlikely(error != -ENOENT))
@@ -275,7 +294,7 @@ xfs_vn_ci_lookup(
if (dentry->d_name.len >= MAXNAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- xfs_dentry_to_name(&xname, dentry, 0);
+ xfs_dentry_to_name(&xname, dentry);
error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
if (unlikely(error)) {
if (unlikely(error != -ENOENT))
@@ -310,7 +329,9 @@ xfs_vn_link(
struct xfs_name name;
int error;
- xfs_dentry_to_name(&name, dentry, inode->i_mode);
+ error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode);
+ if (unlikely(error))
+ return error;
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
if (unlikely(error))
@@ -329,7 +350,7 @@ xfs_vn_unlink(
struct xfs_name name;
int error;
- xfs_dentry_to_name(&name, dentry, 0);
+ xfs_dentry_to_name(&name, dentry);
error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
if (error)
@@ -359,7 +380,9 @@ xfs_vn_symlink(
mode = S_IFLNK |
(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
- xfs_dentry_to_name(&name, dentry, mode);
+ error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ if (unlikely(error))
+ goto out;
error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
if (unlikely(error))
@@ -395,6 +418,7 @@ xfs_vn_rename(
{
struct inode *new_inode = d_inode(ndentry);
int omode = 0;
+ int error;
struct xfs_name oname;
struct xfs_name nname;
@@ -405,8 +429,14 @@ xfs_vn_rename(
if (flags & RENAME_EXCHANGE)
omode = d_inode(ndentry)->i_mode;
- xfs_dentry_to_name(&oname, odentry, omode);
- xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode);
+ error = xfs_dentry_mode_to_name(&oname, odentry, omode);
+ if (omode && unlikely(error))
+ return error;
+
+ error = xfs_dentry_mode_to_name(&nname, ndentry,
+ d_inode(odentry)->i_mode);
+ if (unlikely(error))
+ return error;
return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
XFS_I(ndir), &nname,
@@ -983,15 +1013,13 @@ xfs_vn_setattr(
struct xfs_inode *ip = XFS_I(d_inode(dentry));
uint iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
- error = xfs_break_layouts(d_inode(dentry), &iolock, true);
- if (!error) {
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- iolock |= XFS_MMAPLOCK_EXCL;
+ error = xfs_break_layouts(d_inode(dentry), &iolock);
+ if (error)
+ return error;
- error = xfs_vn_setattr_size(dentry, iattr);
- }
- xfs_iunlock(ip, iolock);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_vn_setattr_size(dentry, iattr);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
error = xfs_vn_setattr_nonsize(dentry, iattr);
}
@@ -1122,7 +1150,6 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
};
static const struct inode_operations xfs_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1131,7 +1158,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {
};
static const struct inode_operations xfs_inline_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = xfs_vn_get_link_inline,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 68640fb63a54..7a989de224f4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -78,11 +78,12 @@ typedef __u32 xfs_nlink_t;
#include <linux/freezer.h>
#include <linux/list_sort.h>
#include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
#include <asm/page.h>
#include <asm/div64.h>
#include <asm/param.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -330,11 +331,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
}
#define ASSERT_ALWAYS(expr) \
- (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#ifdef DEBUG
#define ASSERT(expr) \
- (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#ifndef STATIC
# define STATIC noinline
@@ -345,7 +346,7 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
#ifdef XFS_WARN
#define ASSERT(expr) \
- (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
#ifndef STATIC
# define STATIC static noinline
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3b74fa011bb1..b1469f0a91a6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1668,7 +1668,7 @@ xlog_cksum(
__uint32_t crc;
/* first generate the crc for the record header ... */
- crc = xfs_start_cksum((char *)rhead,
+ crc = xfs_start_cksum_update((char *)rhead,
sizeof(struct xlog_rec_header),
offsetof(struct xlog_rec_header, h_crc));
@@ -1862,26 +1862,21 @@ xlog_sync(
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
- bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
+ bp->b_flags &= ~XBF_FLUSH;
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
- if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
- bp->b_flags |= XBF_FUA;
-
- /*
- * Flush the data device before flushing the log to make
- * sure all meta data written back from the AIL actually made
- * it to disk before stamping the new log tail LSN into the
- * log buffer. For an external log we need to issue the
- * flush explicitly, and unfortunately synchronously here;
- * for an internal log we can simply use the block layer
- * state machine for preflushes.
- */
- if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
- xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- else
- bp->b_flags |= XBF_FLUSH;
- }
+ /*
+ * Flush the data device before flushing the log to make sure all meta
+ * data written back from the AIL actually made it to disk before
+ * stamping the new log tail LSN into the log buffer. For an external
+ * log we need to issue the flush explicitly, and unfortunately
+ * synchronously here; for an internal log we can simply use the block
+ * layer state machine for preflushes.
+ */
+ if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+ else
+ bp->b_flags |= XBF_FLUSH;
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1906,10 +1901,8 @@ xlog_sync(
xfs_buf_associate_memory(bp,
(char *)&iclog->ic_header + count, split);
bp->b_fspriv = iclog;
- bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
- if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
- bp->b_flags |= XBF_FUA;
+ bp->b_flags &= ~XBF_FLUSH;
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -3324,12 +3317,8 @@ xfs_log_force(
xfs_mount_t *mp,
uint flags)
{
- int error;
-
trace_xfs_log_force(mp, 0, _RET_IP_);
- error = _xfs_log_force(mp, flags, NULL);
- if (error)
- xfs_warn(mp, "%s: error %d returned.", __func__, error);
+ _xfs_log_force(mp, flags, NULL);
}
/*
@@ -3473,12 +3462,8 @@ xfs_log_force_lsn(
xfs_lsn_t lsn,
uint flags)
{
- int error;
-
trace_xfs_log_force(mp, lsn, _RET_IP_);
- error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
- if (error)
- xfs_warn(mp, "%s: error %d returned.", __func__, error);
+ _xfs_log_force_lsn(mp, lsn, flags, NULL);
}
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9b3d7c76915d..4a98762ec8b4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- ushort flags)
+ unsigned short flags)
{
struct list_head *bucket;
struct xfs_buf_cancel *bcp;
@@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- ushort flags)
+ unsigned short flags)
{
struct xfs_buf_cancel *bcp;
@@ -5113,19 +5113,21 @@ xlog_recover_process(
struct list_head *buffer_list)
{
int error;
+ __le32 old_crc = rhead->h_crc;
__le32 crc;
+
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
/*
* Nothing else to do if this is a CRC verification pass. Just return
* if this is a record with a non-zero crc. Unfortunately, mkfs always
- * sets h_crc to 0 so we must consider this valid even on v5 supers.
+ * sets h_crc (saved here as old_crc) to 0 so we must consider this valid even on v5 supers.
* Otherwise, return EFSBADCRC on failure so the callers up the stack
* know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (rhead->h_crc && crc != rhead->h_crc)
+ if (old_crc && crc != old_crc)
return -EFSBADCRC;
return 0;
}
@@ -5136,11 +5138,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != rhead->h_crc) {
- if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (crc != old_crc) {
+ if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
- le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(old_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b341f10cf481..9b9540db17a6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -157,6 +157,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
@@ -212,8 +213,8 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- spin_lock_init(&pag->pag_buf_lock);
- pag->pag_buf_tree = RB_ROOT;
+ if (xfs_buf_hash_init(pag))
+ goto out_unwind;
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
@@ -239,9 +240,11 @@ xfs_initialize_perag(
return 0;
out_unwind:
+ xfs_buf_hash_destroy(pag);
kmem_free(pag);
for (; index > first_initialised; index--) {
pag = radix_tree_delete(&mp->m_perag_tree, index);
+ xfs_buf_hash_destroy(pag);
kmem_free(pag);
}
return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 819b80b15bfb..7f351f706b7a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
+ bool m_inotbt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -393,8 +394,8 @@ typedef struct xfs_perag {
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
- struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
/* for rcu-safe freeing */
struct rcu_head rcu_head;
@@ -424,6 +425,9 @@ xfs_perag_resv(
}
}
+int xfs_buf_hash_init(xfs_perag_t *pag);
+void xfs_buf_hash_destroy(xfs_perag_t *pag);
+
extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 93a7aafa56d6..2f2dc3c09ad0 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -32,8 +32,7 @@
int
xfs_break_layouts(
struct inode *inode,
- uint *iolock,
- bool with_imutex)
+ uint *iolock)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
@@ -42,12 +41,8 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
- if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
- inode_unlock(inode);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
- if (with_imutex)
- inode_lock(inode);
xfs_ilock(ip, *iolock);
}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index e8339f74966b..b587cb99b2b7 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr);
-int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
+int xfs_break_layouts(struct inode *inode, uint *iolock);
#else
static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
+xfs_break_layouts(struct inode *inode, uint *iolock)
{
return 0;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index a60d9e2739d1..b669b123287b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks(
return error;
}
rtblks = 0;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (idx = 0; idx < nextents; idx++)
rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
*O_rtblks = (xfs_qcnt_t)rtblks;
@@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust(
* the case in all other instances. It's OK that we do this because
* quotacheck is done only at mount time.
*/
- error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
+ &ip);
if (error) {
*res = BULKSTAT_RV_NOTHING;
return error;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe86a668a57e..6e4c7446c3d4 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -526,13 +526,14 @@ xfs_cui_recover(
xfs_refcount_finish_one_cleanup(tp, rcur, error);
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
- goto abort_error;
+ goto abort_defer;
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
error = xfs_trans_commit(tp);
return error;
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
+abort_defer:
xfs_defer_cancel(&dfops);
xfs_trans_cancel(tp);
return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index a279b4e7f5fe..07593a362cd0 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -243,12 +243,11 @@ xfs_reflink_reserve_cow(
struct xfs_bmbt_irec *imap,
bool *shared)
{
- struct xfs_bmbt_irec got, prev;
- xfs_fileoff_t end_fsb, orig_end_fsb;
- int eof = 0, error = 0;
- bool trimmed;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got;
+ int error = 0;
+ bool eof = false, trimmed;
xfs_extnum_t idx;
- xfs_extlen_t align;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -258,8 +257,9 @@ xfs_reflink_reserve_cow(
* extent list is generally faster than going out to the shared extent
* tree.
*/
- xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
+ eof = true;
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
@@ -285,33 +285,12 @@ xfs_reflink_reserve_cow(
if (error)
return error;
- end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
-
- align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
- if (align)
- end_fsb = roundup_64(end_fsb, align);
-
-retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
+ imap->br_blockcount, 0, &got, &idx, eof);
+ if (error == -ENOSPC || error == -EDQUOT)
trace_xfs_reflink_cow_enospc(ip, imap);
- if (end_fsb != orig_end_fsb) {
- end_fsb = orig_end_fsb;
- goto retry;
- }
- /*FALLTHRU*/
- default:
+ if (error)
return error;
- }
-
- if (end_fsb != orig_end_fsb)
- xfs_inode_set_cowblocks_tag(ip);
trace_xfs_reflink_cow_alloc(ip, &got);
return 0;
@@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range(
}
/*
- * Find the CoW reservation (and whether or not it needs block allocation)
- * for a given byte offset of a file.
+ * Find the CoW reservation for a given byte offset of a file.
*/
bool
xfs_reflink_find_cow_mapping(
struct xfs_inode *ip,
xfs_off_t offset,
- struct xfs_bmbt_irec *imap,
- bool *need_alloc)
+ struct xfs_bmbt_irec *imap)
{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
- xfs_fileoff_t bno;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ xfs_fileoff_t offset_fsb;
+ struct xfs_bmbt_irec got;
xfs_extnum_t idx;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(xfs_is_reflink_inode(ip));
- /* Find the extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- bno = XFS_B_TO_FSBT(ip->i_mount, offset);
- gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
- if (!gotp)
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
return false;
-
- xfs_bmbt_get_all(gotp, &irec);
- if (bno >= irec.br_startoff + irec.br_blockcount ||
- bno < irec.br_startoff)
+ if (got.br_startoff > offset_fsb)
return false;
trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
- &irec);
-
- /* If it's still delalloc, we must allocate later. */
- *imap = irec;
- *need_alloc = !!(isnullstartblock(irec.br_startblock));
-
+ &got);
+ *imap = got;
return true;
}
/*
* Trim an extent to end at the next CoW reservation past offset_fsb.
*/
-int
+void
xfs_reflink_trim_irec_to_next_cow(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
struct xfs_bmbt_irec *imap)
{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got;
xfs_extnum_t idx;
if (!xfs_is_reflink_inode(ip))
- return 0;
+ return;
/* Find the extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
- if (!gotp)
- return 0;
- xfs_bmbt_get_all(gotp, &irec);
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ return;
/* This is the extent before; try sliding up one. */
- if (irec.br_startoff < offset_fsb) {
- idx++;
- if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- return 0;
- gotp = xfs_iext_get_ext(ifp, idx);
- xfs_bmbt_get_all(gotp, &irec);
+ if (got.br_startoff < offset_fsb) {
+ if (!xfs_iext_get_extent(ifp, idx + 1, &got))
+ return;
}
- if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
- return 0;
+ if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
+ return;
- imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+ imap->br_blockcount = got.br_startoff - imap->br_startoff;
trace_xfs_reflink_trim_irec(ip, imap);
-
- return 0;
}
/*
@@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, prev, del;
+ struct xfs_bmbt_irec got, del;
xfs_extnum_t idx;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error = 0, eof = 0;
+ int error = 0;
if (!xfs_is_reflink_inode(ip))
return 0;
-
- xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
- if (eof)
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
return 0;
while (got.br_startoff < end_fsb) {
@@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
}
- if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
+ if (!xfs_iext_get_extent(ifp, ++idx, &got))
break;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
/* clear tag if cow fork is emptied */
@@ -638,13 +591,13 @@ xfs_reflink_end_cow(
xfs_off_t count)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, prev, del;
+ struct xfs_bmbt_irec got, del;
struct xfs_trans *tp;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error, eof = 0;
+ int error;
unsigned int resblks;
xfs_filblks_t rlen;
xfs_extnum_t idx;
@@ -668,13 +621,11 @@ xfs_reflink_end_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
-
/* If there is a hole at end_fsb - 1 go to the previous extent */
- if (eof || got.br_startoff > end_fsb) {
+ if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
+ got.br_startoff > end_fsb) {
ASSERT(idx > 0);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+ xfs_iext_get_extent(ifp, --idx, &got);
}
/* Walk backwards until we're out of the I/O range... */
@@ -722,11 +673,9 @@ xfs_reflink_end_cow(
error = xfs_defer_finish(&tp, &dfops, ip);
if (error)
goto out_defer;
-
next_extent:
- if (idx < 0)
+ if (!xfs_iext_get_extent(ifp, idx, &got))
break;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
error = xfs_trans_commit(tp);
@@ -1165,111 +1114,6 @@ err:
}
/*
- * Read a page's worth of file data into the page cache. Return the page
- * locked.
- */
-static struct page *
-xfs_get_page(
- struct inode *inode,
- xfs_off_t offset)
-{
- struct address_space *mapping;
- struct page *page;
- pgoff_t n;
-
- n = offset >> PAGE_SHIFT;
- mapping = inode->i_mapping;
- page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- lock_page(page);
- return page;
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- */
-static int
-xfs_compare_extents(
- struct inode *src,
- xfs_off_t srcoff,
- struct inode *dest,
- xfs_off_t destoff,
- xfs_off_t len,
- bool *is_same)
-{
- xfs_off_t src_poff;
- xfs_off_t dest_poff;
- void *src_addr;
- void *dest_addr;
- struct page *src_page;
- struct page *dest_page;
- xfs_off_t cmp_len;
- bool same;
- int error;
-
- error = -EINVAL;
- same = true;
- while (len) {
- src_poff = srcoff & (PAGE_SIZE - 1);
- dest_poff = destoff & (PAGE_SIZE - 1);
- cmp_len = min(PAGE_SIZE - src_poff,
- PAGE_SIZE - dest_poff);
- cmp_len = min(cmp_len, len);
- ASSERT(cmp_len > 0);
-
- trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
- XFS_I(dest), destoff);
-
- src_page = xfs_get_page(src, srcoff);
- if (IS_ERR(src_page)) {
- error = PTR_ERR(src_page);
- goto out_error;
- }
- dest_page = xfs_get_page(dest, destoff);
- if (IS_ERR(dest_page)) {
- error = PTR_ERR(dest_page);
- unlock_page(src_page);
- put_page(src_page);
- goto out_error;
- }
- src_addr = kmap_atomic(src_page);
- dest_addr = kmap_atomic(dest_page);
-
- flush_dcache_page(src_page);
- flush_dcache_page(dest_page);
-
- if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- same = false;
-
- kunmap_atomic(dest_addr);
- kunmap_atomic(src_addr);
- unlock_page(dest_page);
- unlock_page(src_page);
- put_page(dest_page);
- put_page(src_page);
-
- if (!same)
- break;
-
- srcoff += cmp_len;
- destoff += cmp_len;
- len -= cmp_len;
- }
-
- *is_same = same;
- return 0;
-
-out_error:
- trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
- return error;
-}
-
-/*
* Link a range of blocks from one file to another.
*/
int
@@ -1286,14 +1130,11 @@ xfs_reflink_remap_range(
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
struct xfs_mount *mp = src->i_mount;
- loff_t bs = inode_out->i_sb->s_blocksize;
bool same_inode = (inode_in == inode_out);
xfs_fileoff_t sfsbno, dfsbno;
xfs_filblks_t fsblen;
xfs_extlen_t cowextsize;
- loff_t isize;
ssize_t ret;
- loff_t blen;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1302,34 +1143,14 @@ xfs_reflink_remap_range(
return -EIO;
/* Lock both files against IO */
- if (same_inode) {
- xfs_ilock(src, XFS_IOLOCK_EXCL);
+ lock_two_nondirectories(inode_in, inode_out);
+ if (same_inode)
xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- } else {
- xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+ else
xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
- }
-
- /* Don't touch certain kinds of inodes */
- ret = -EPERM;
- if (IS_IMMUTABLE(inode_out))
- goto out_unlock;
- ret = -ETXTBSY;
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- goto out_unlock;
-
-
- /* Don't reflink dirs, pipes, sockets... */
- ret = -EISDIR;
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- goto out_unlock;
+ /* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
- if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
- goto out_unlock;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- goto out_unlock;
-
/* Don't reflink realtime inodes */
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
@@ -1338,91 +1159,18 @@ xfs_reflink_remap_range(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
- /* Are we going all the way to the end? */
- isize = i_size_read(inode_in);
- if (isize == 0) {
- ret = 0;
- goto out_unlock;
- }
-
- if (len == 0)
- len = isize - pos_in;
-
- /* Ensure offsets don't wrap and the input is inside i_size */
- if (pos_in + len < pos_in || pos_out + len < pos_out ||
- pos_in + len > isize)
- goto out_unlock;
-
- /* Don't allow dedupe past EOF in the dest file */
- if (is_dedupe) {
- loff_t disize;
-
- disize = i_size_read(inode_out);
- if (pos_out >= disize || pos_out + len > disize)
- goto out_unlock;
- }
-
- /* If we're linking to EOF, continue to the block boundary. */
- if (pos_in + len == isize)
- blen = ALIGN(isize, bs) - pos_in;
- else
- blen = len;
-
- /* Only reflink if we're aligned to block boundaries */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
- !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
- goto out_unlock;
-
- /* Don't allow overlapped reflink within the same file */
- if (same_inode) {
- if (pos_out + blen > pos_in && pos_out < pos_in + blen)
- goto out_unlock;
- }
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + len - 1);
- if (ret)
- goto out_unlock;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + len - 1);
- if (ret)
+ ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+ &len, is_dedupe);
+ if (ret <= 0)
goto out_unlock;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
- /*
- * Check that the extents are the same.
- */
- if (is_dedupe) {
- bool is_same = false;
-
- ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
- len, &is_same);
- if (ret)
- goto out_unlock;
- if (!is_same) {
- ret = -EBADE;
- goto out_unlock;
- }
- }
-
+ /* Set flags and remap blocks. */
ret = xfs_reflink_set_inode_flag(src, dest);
if (ret)
goto out_unlock;
- /*
- * Invalidate the page cache so that we can clear any CoW mappings
- * in the destination file.
- */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
@@ -1431,6 +1179,10 @@ xfs_reflink_remap_range(
if (ret)
goto out_unlock;
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
@@ -1447,11 +1199,9 @@ xfs_reflink_remap_range(
out_unlock:
xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(src, XFS_IOLOCK_EXCL);
- if (src->i_ino != dest->i_ino) {
+ if (!same_inode)
xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(dest, XFS_IOLOCK_EXCL);
- }
+ unlock_two_nondirectories(inode_in, inode_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
@@ -1697,37 +1447,3 @@ out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
}
-
-/*
- * Does this inode have any real CoW reservations?
- */
-bool
-xfs_reflink_has_real_cow_blocks(
- struct xfs_inode *ip)
-{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
- xfs_extnum_t idx;
-
- if (!xfs_is_reflink_inode(ip))
- return false;
-
- /* Go find the old extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
- while (gotp) {
- xfs_bmbt_get_all(gotp, &irec);
-
- if (!isnullstartblock(irec.br_startblock))
- return true;
-
- /* Roll on... */
- idx++;
- if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- break;
- gotp = xfs_iext_get_ext(ifp, idx);
- }
-
- return false;
-}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index fad11607c9ad..aa6a4d64bd35 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
xfs_off_t offset, xfs_off_t count);
extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
- struct xfs_bmbt_irec *imap, bool *need_alloc);
-extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
@@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
-extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
-
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 12d48cd8f8a4..f11282c96887 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
- xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
- xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
}
len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
@@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
for_each_possible_cpu(c) {
preempt_disable();
/* save vn_active, it's a universal truth! */
- vn_active = per_cpu_ptr(stats, c)->vn_active;
+ vn_active = per_cpu_ptr(stats, c)->s.vn_active;
memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
- per_cpu_ptr(stats, c)->vn_active = vn_active;
+ per_cpu_ptr(stats, c)->s.vn_active = vn_active;
preempt_enable();
}
}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 79ad2e69fc33..375840f5a99a 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -22,9 +22,37 @@
#include <linux/percpu.h>
/*
+ * The btree stats arrays have fixed offsets for the different stats. We
+ * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and
+ * that allows us to use fixed offsets into the stats array for each btree
+ * stat. These index offsets are defined in the order they will be emitted
+ * in the stats files, so it is possible to add new btree stat types by
+ * appending to the enum list below.
+ */
+enum {
+ __XBTS_lookup = 0,
+ __XBTS_compare = 1,
+ __XBTS_insrec = 2,
+ __XBTS_delrec = 3,
+ __XBTS_newroot = 4,
+ __XBTS_killroot = 5,
+ __XBTS_increment = 6,
+ __XBTS_decrement = 7,
+ __XBTS_lshift = 8,
+ __XBTS_rshift = 9,
+ __XBTS_split = 10,
+ __XBTS_join = 11,
+ __XBTS_alloc = 12,
+ __XBTS_free = 13,
+ __XBTS_moves = 14,
+
+ __XBTS_MAX = 15,
+};
+
+/*
* XFS global statistics
*/
-struct xfsstats {
+struct __xfsstats {
# define XFSSTAT_END_EXTENT_ALLOC 4
__uint32_t xs_allocx;
__uint32_t xs_allocb;
@@ -117,118 +145,20 @@ struct xfsstats {
__uint32_t xb_page_found;
__uint32_t xb_get_read;
/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
- __uint32_t xs_abtb_2_lookup;
- __uint32_t xs_abtb_2_compare;
- __uint32_t xs_abtb_2_insrec;
- __uint32_t xs_abtb_2_delrec;
- __uint32_t xs_abtb_2_newroot;
- __uint32_t xs_abtb_2_killroot;
- __uint32_t xs_abtb_2_increment;
- __uint32_t xs_abtb_2_decrement;
- __uint32_t xs_abtb_2_lshift;
- __uint32_t xs_abtb_2_rshift;
- __uint32_t xs_abtb_2_split;
- __uint32_t xs_abtb_2_join;
- __uint32_t xs_abtb_2_alloc;
- __uint32_t xs_abtb_2_free;
- __uint32_t xs_abtb_2_moves;
-#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
- __uint32_t xs_abtc_2_lookup;
- __uint32_t xs_abtc_2_compare;
- __uint32_t xs_abtc_2_insrec;
- __uint32_t xs_abtc_2_delrec;
- __uint32_t xs_abtc_2_newroot;
- __uint32_t xs_abtc_2_killroot;
- __uint32_t xs_abtc_2_increment;
- __uint32_t xs_abtc_2_decrement;
- __uint32_t xs_abtc_2_lshift;
- __uint32_t xs_abtc_2_rshift;
- __uint32_t xs_abtc_2_split;
- __uint32_t xs_abtc_2_join;
- __uint32_t xs_abtc_2_alloc;
- __uint32_t xs_abtc_2_free;
- __uint32_t xs_abtc_2_moves;
-#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
- __uint32_t xs_bmbt_2_lookup;
- __uint32_t xs_bmbt_2_compare;
- __uint32_t xs_bmbt_2_insrec;
- __uint32_t xs_bmbt_2_delrec;
- __uint32_t xs_bmbt_2_newroot;
- __uint32_t xs_bmbt_2_killroot;
- __uint32_t xs_bmbt_2_increment;
- __uint32_t xs_bmbt_2_decrement;
- __uint32_t xs_bmbt_2_lshift;
- __uint32_t xs_bmbt_2_rshift;
- __uint32_t xs_bmbt_2_split;
- __uint32_t xs_bmbt_2_join;
- __uint32_t xs_bmbt_2_alloc;
- __uint32_t xs_bmbt_2_free;
- __uint32_t xs_bmbt_2_moves;
-#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
- __uint32_t xs_ibt_2_lookup;
- __uint32_t xs_ibt_2_compare;
- __uint32_t xs_ibt_2_insrec;
- __uint32_t xs_ibt_2_delrec;
- __uint32_t xs_ibt_2_newroot;
- __uint32_t xs_ibt_2_killroot;
- __uint32_t xs_ibt_2_increment;
- __uint32_t xs_ibt_2_decrement;
- __uint32_t xs_ibt_2_lshift;
- __uint32_t xs_ibt_2_rshift;
- __uint32_t xs_ibt_2_split;
- __uint32_t xs_ibt_2_join;
- __uint32_t xs_ibt_2_alloc;
- __uint32_t xs_ibt_2_free;
- __uint32_t xs_ibt_2_moves;
-#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15)
- __uint32_t xs_fibt_2_lookup;
- __uint32_t xs_fibt_2_compare;
- __uint32_t xs_fibt_2_insrec;
- __uint32_t xs_fibt_2_delrec;
- __uint32_t xs_fibt_2_newroot;
- __uint32_t xs_fibt_2_killroot;
- __uint32_t xs_fibt_2_increment;
- __uint32_t xs_fibt_2_decrement;
- __uint32_t xs_fibt_2_lshift;
- __uint32_t xs_fibt_2_rshift;
- __uint32_t xs_fibt_2_split;
- __uint32_t xs_fibt_2_join;
- __uint32_t xs_fibt_2_alloc;
- __uint32_t xs_fibt_2_free;
- __uint32_t xs_fibt_2_moves;
-#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15)
- __uint32_t xs_rmap_2_lookup;
- __uint32_t xs_rmap_2_compare;
- __uint32_t xs_rmap_2_insrec;
- __uint32_t xs_rmap_2_delrec;
- __uint32_t xs_rmap_2_newroot;
- __uint32_t xs_rmap_2_killroot;
- __uint32_t xs_rmap_2_increment;
- __uint32_t xs_rmap_2_decrement;
- __uint32_t xs_rmap_2_lshift;
- __uint32_t xs_rmap_2_rshift;
- __uint32_t xs_rmap_2_split;
- __uint32_t xs_rmap_2_join;
- __uint32_t xs_rmap_2_alloc;
- __uint32_t xs_rmap_2_free;
- __uint32_t xs_rmap_2_moves;
-#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15)
- __uint32_t xs_refcbt_2_lookup;
- __uint32_t xs_refcbt_2_compare;
- __uint32_t xs_refcbt_2_insrec;
- __uint32_t xs_refcbt_2_delrec;
- __uint32_t xs_refcbt_2_newroot;
- __uint32_t xs_refcbt_2_killroot;
- __uint32_t xs_refcbt_2_increment;
- __uint32_t xs_refcbt_2_decrement;
- __uint32_t xs_refcbt_2_lshift;
- __uint32_t xs_refcbt_2_rshift;
- __uint32_t xs_refcbt_2_split;
- __uint32_t xs_refcbt_2_join;
- __uint32_t xs_refcbt_2_alloc;
- __uint32_t xs_refcbt_2_free;
- __uint32_t xs_refcbt_2_moves;
+#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
+ __uint32_t xs_abtb_2[__XBTS_MAX];
+#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
+ __uint32_t xs_abtc_2[__XBTS_MAX];
+#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
+ __uint32_t xs_bmbt_2[__XBTS_MAX];
+#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
+ __uint32_t xs_ibt_2[__XBTS_MAX];
+#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
+ __uint32_t xs_fibt_2[__XBTS_MAX];
+#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
+ __uint32_t xs_rmap_2[__XBTS_MAX];
+#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
+ __uint32_t xs_refcbt_2[__XBTS_MAX];
#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
@@ -245,26 +175,58 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
+struct xfsstats {
+ union {
+ struct __xfsstats s;
+ uint32_t a[XFSSTAT_END_XQMSTAT];
+ };
+};
+
+/*
+ * simple wrapper for getting the array index of s struct member offset
+ */
+#define XFS_STATS_CALC_INDEX(member) \
+ (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t))
+
+
int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
void xfs_stats_clearall(struct xfsstats __percpu *stats);
extern struct xstats xfsstats;
#define XFS_STATS_INC(mp, v) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \
} while (0)
#define XFS_STATS_DEC(mp, v) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \
} while (0)
#define XFS_STATS_ADD(mp, v, inc) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \
+} while (0)
+
+#define XFS_STATS_INC_OFF(mp, off) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \
+} while (0)
+
+#define XFS_STATS_DEC_OFF(mp, off) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--; \
+} while (0)
+
+#define XFS_STATS_ADD_OFF(mp, off, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \
} while (0)
#if defined(CONFIG_PROC_FS)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ade4691e3f74..eecbaac08eba 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -104,9 +104,6 @@ static const match_table_t tokens = {
{Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
{Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
{Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
- {Opt_barrier, "barrier"}, /* use writer barriers for log write and
- * unwritten extent conversion */
- {Opt_nobarrier, "nobarrier"}, /* .. disable */
{Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
{Opt_inode32, "inode32"}, /* inode allocation limited to
* XFS_MAXINUMBER_32 */
@@ -134,6 +131,12 @@ static const match_table_t tokens = {
{Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
{Opt_dax, "dax"}, /* Enable direct access to bdev pages */
+
+ /* Deprecated mount options scheduled for removal */
+ {Opt_barrier, "barrier"}, /* use writer barriers for log write and
+ * unwritten extent conversion */
+ {Opt_nobarrier, "nobarrier"}, /* .. disable */
+
{Opt_err, NULL},
};
@@ -301,12 +304,6 @@ xfs_parseargs(
case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
break;
- case Opt_barrier:
- mp->m_flags |= XFS_MOUNT_BARRIER;
- break;
- case Opt_nobarrier:
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- break;
case Opt_ikeep:
mp->m_flags |= XFS_MOUNT_IKEEP;
break;
@@ -374,6 +371,14 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DAX;
break;
#endif
+ case Opt_barrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ break;
+ case Opt_nobarrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+ break;
default:
xfs_warn(mp, "unknown mount option [%s].", p);
return -EINVAL;
@@ -943,7 +948,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
@@ -1238,9 +1243,11 @@ xfs_fs_remount(
token = match_token(p, tokens, args);
switch (token) {
case Opt_barrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
mp->m_flags |= XFS_MOUNT_BARRIER;
break;
case Opt_nobarrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
case Opt_inode64:
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 58142aeeeea6..f2cb45ed1d54 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -238,8 +238,7 @@ xfs_symlink(
if (error)
goto out_release_inode;
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
- XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
@@ -287,7 +286,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
/*
@@ -412,7 +411,7 @@ out_release_inode:
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 276d3023d60f..de6195e38910 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -396,7 +396,7 @@ max_retries_show(
int retries;
struct xfs_error_cfg *cfg = to_error_cfg(kobject);
- if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+ if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
retries = -1;
else
retries = cfg->max_retries;
@@ -422,7 +422,7 @@ max_retries_store(
return -EINVAL;
if (val == -1)
- cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+ cfg->max_retries = XFS_ERR_RETRY_FOREVER;
else
cfg->max_retries = val;
return count;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0907752be62d..69c5bcd9a51b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
DEFINE_BUF_EVENT(xfs_buf_iodone);
DEFINE_BUF_EVENT(xfs_buf_submit);
DEFINE_BUF_EVENT(xfs_buf_submit_wait);
-DEFINE_BUF_EVENT(xfs_buf_bawrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
@@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
-DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
DEFINE_BUF_EVENT(xfs_reset_dqcounts);
-DEFINE_BUF_EVENT(xfs_inode_item_push);
/* pass flags explicitly */
DECLARE_EVENT_CLASS(xfs_buf_flags_class,
@@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
DECLARE_EVENT_CLASS(xfs_filestream_class,
TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
@@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_evict_inode);
DEFINE_INODE_EVENT(xfs_update_time);
DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post,
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
TP_ARGS(ip, caller_ip))
-DEFINE_IREF_EVENT(xfs_ihold);
DEFINE_IREF_EVENT(xfs_irele);
DEFINE_IREF_EVENT(xfs_inode_pin);
DEFINE_IREF_EVENT(xfs_inode_unpin);
@@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss);
DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
DEFINE_DQUOT_EVENT(xfs_dqget_dup);
DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_wait);
DEFINE_DQUOT_EVENT(xfs_dqput_free);
DEFINE_DQUOT_EVENT(xfs_dqrele);
DEFINE_DQUOT_EVENT(xfs_dqflush);
@@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
DEFINE_ATTR_EVENT(xfs_attr_sf_create);
DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
-DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
@@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
DEFINE_ATTR_EVENT(xfs_attr_node_get);
-DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
DEFINE_ATTR_EVENT(xfs_attr_node_removename);
@@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done);
DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
@@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \
struct xfs_inode *dest, xfs_off_t doffset), \
TP_ARGS(src, soffset, len, dest, doffset))
-/* two-file vfs io tracepoint class */
-DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
- TP_PROTO(struct inode *src, u64 soffset, u64 len,
- struct inode *dest, u64 doffset),
- TP_ARGS(src, soffset, len, dest, doffset),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(unsigned long, src_ino)
- __field(loff_t, src_isize)
- __field(loff_t, src_offset)
- __field(size_t, len)
- __field(unsigned long, dest_ino)
- __field(loff_t, dest_isize)
- __field(loff_t, dest_offset)
- ),
- TP_fast_assign(
- __entry->dev = src->i_sb->s_dev;
- __entry->src_ino = src->i_ino;
- __entry->src_isize = i_size_read(src);
- __entry->src_offset = soffset;
- __entry->len = len;
- __entry->dest_ino = dest->i_ino;
- __entry->dest_isize = i_size_read(dest);
- __entry->dest_offset = doffset;
- ),
- TP_printk("dev %d:%d count %zd "
- "ino 0x%lx isize 0x%llx offset 0x%llx -> "
- "ino 0x%lx isize 0x%llx offset 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->len,
- __entry->src_ino,
- __entry->src_isize,
- __entry->src_offset,
- __entry->dest_ino,
- __entry->dest_isize,
- __entry->dest_offset)
-)
-
-#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \
-DEFINE_EVENT(xfs_double_vfs_io_class, name, \
- TP_PROTO(struct inode *src, u64 soffset, u64 len, \
- struct inode *dest, u64 doffset), \
- TP_ARGS(src, soffset, len, dest, doffset))
-
-/* CoW write tracepoint */
-DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
- xfs_extlen_t len, xfs_fsblock_t new_pblk),
- TP_ARGS(ip, lblk, pblk, len, new_pblk),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_fsblock_t, pblk)
- __field(xfs_extlen_t, len)
- __field(xfs_fsblock_t, new_pblk)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->pblk = pblk;
- __entry->len = len;
- __entry->new_pblk = new_pblk;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
- "len 0x%x new_pblk %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->pblk,
- __entry->len,
- __entry->new_pblk)
-)
-
-#define DEFINE_COW_EVENT(name) \
-DEFINE_EVENT(xfs_copy_on_write_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
- xfs_extlen_t len, xfs_fsblock_t new_pblk), \
- TP_ARGS(ip, lblk, pblk, len, new_pblk))
-
/* inode/irec events */
DECLARE_EVENT_CLASS(xfs_inode_irec_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
@@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
@@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
/* ioctl tracepoints */
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
TRACE_EVENT(xfs_ioctl_clone,
TP_PROTO(struct inode *src, struct inode *dest),
TP_ARGS(src, dest),
@@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone,
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
-DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
@@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
-DEFINE_COW_EVENT(xfs_reflink_fork_buf);
-DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
-DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
/* rmap swapext tracepoints */
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 62900938f26d..0594db435972 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
NULL
};
-static int
+static void
__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
char *prefix,
@@ -148,7 +148,7 @@ __xfs_xattr_put_listent(
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
context->seen_enough = 1;
- return 0;
+ return;
}
offset = (char *)context->alist + context->count;
strncpy(offset, prefix, prefix_len);
@@ -159,10 +159,10 @@ __xfs_xattr_put_listent(
compute_size:
context->count += prefix_len + namelen + 1;
- return 0;
+ return;
}
-static int
+static void
xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
@@ -180,23 +180,19 @@ xfs_xattr_put_listent(
if (namelen == SGI_ACL_FILE_SIZE &&
strncmp(name, SGI_ACL_FILE,
SGI_ACL_FILE_SIZE) == 0) {
- int ret = __xfs_xattr_put_listent(
+ __xfs_xattr_put_listent(
context, XATTR_SYSTEM_PREFIX,
XATTR_SYSTEM_PREFIX_LEN,
XATTR_POSIX_ACL_ACCESS,
strlen(XATTR_POSIX_ACL_ACCESS));
- if (ret)
- return ret;
} else if (namelen == SGI_ACL_DEFAULT_SIZE &&
strncmp(name, SGI_ACL_DEFAULT,
SGI_ACL_DEFAULT_SIZE) == 0) {
- int ret = __xfs_xattr_put_listent(
+ __xfs_xattr_put_listent(
context, XATTR_SYSTEM_PREFIX,
XATTR_SYSTEM_PREFIX_LEN,
XATTR_POSIX_ACL_DEFAULT,
strlen(XATTR_POSIX_ACL_DEFAULT));
- if (ret)
- return ret;
}
#endif
@@ -205,7 +201,7 @@ xfs_xattr_put_listent(
* see them.
*/
if (!capable(CAP_SYS_ADMIN))
- return 0;
+ return;
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
@@ -217,8 +213,9 @@ xfs_xattr_put_listent(
prefix_len = XATTR_USER_PREFIX_LEN;
}
- return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
- namelen);
+ __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
+ return;
}
ssize_t