Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c  351
1 file changed, 312 insertions(+), 39 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..84f08c976ac4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data makes it to disk before logging the new
* inode size in case of an extending write.
*/
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
- bool *drained_dio)
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- unsigned int *iolock)
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +485,7 @@ restart:
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
- &drained_dio);
+ &drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +495,48 @@ restart:
return kiocb_modified(iocb);
}
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
+ /*
+ * Check the rlimit and LFS boundary first so that we don't over-reserve,
+ * possibly by a lot.
+ *
+ * The generic write path will redo this check later, and the limit might
+ * have changed by then. If it was expanded we'll stick to our earlier
+ * smaller limit, and if it was decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
+ */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining blocks will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(ip,
+ XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
+}
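
A worked illustration of the sizing above, as a hedged sketch (hypothetical
numbers, assuming a 4 KiB file system block size):

	/*
	 * Hypothetical: a 6000-byte write.
	 *
	 *   XFS_B_TO_FSB(mp, 6000) = 2   rounded-up payload blocks
	 *   + 1                          start offset not yet stable for
	 *                                O_APPEND, so the payload may
	 *                                straddle one more block boundary
	 *   + 2                          out-of-place zeroing of an
	 *                                unaligned old EOF block and an
	 *                                unaligned new start block
	 *   = 5 blocks reserved
	 *
	 * Blocks the write does not consume are handed back by
	 * xfs_zoned_space_unreserve() afterwards.
	 */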
+
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -503,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_IOEND_DIRECT);
+ xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
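
The submit_io hook debits each bio against the reservation before handing it
to the zone allocator; a sketch of that invariant (hypothetical sizes, 4 KiB
blocks):

	/*
	 * A 256 KiB bio consumes XFS_B_TO_FSB(mp, 262144) = 64 of
	 * ac->reserved_blocks.  Every zoned write path reserves an upper
	 * bound before issuing I/O, so a bio exceeding the remaining
	 * reservation can only mean the in-core accounting is corrupt,
	 * hence SHUTDOWN_CORRUPT_INCORE rather than an -ENOSPC retry.
	 */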
+
/*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -597,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@@ -611,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, NULL, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
- if (iolock)
- xfs_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
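
This function shows the bracket every zoned write path follows; a minimal
sketch, with the write step elided:

	error = xfs_zoned_write_space_reserve(ip, iocb, from, flags, &ac);
	if (error < 0)
		return error;
	/* ...perform the write; each bio debits ac.reserved_blocks... */
	xfs_zoned_space_unreserve(ip, &ac);

The unreserve runs on success and failure alike, so blocks the write did not
consume always flow back to the zone allocator.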
@@ -675,7 +781,7 @@ retry_exclusive:
goto out_unlock;
}
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -721,9 +827,21 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+
+ /*
+ * For always COW inodes we also must check the alignment of each
+ * individual iovec segment, as they could end up with different
+ * I/Os due to the way bio_iov_iter_get_pages works, and we'd
+ * then overwrite an already written block.
+ */
+ if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
+ (xfs_is_always_cow_inode(ip) &&
+ (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
- return xfs_file_dio_write_aligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
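
The per-segment check matters because bio_iov_iter_get_pages() may turn one
iov_iter into several bios; a hypothetical illustration (4 KiB blocks, buf
page aligned):

	struct iovec iov[2] = {
		{ .iov_base = buf,        .iov_len = 2048 },
		{ .iov_base = buf + 8192, .iov_len = 2048 },
	};

Here ki_pos and the 4096-byte total are block aligned, but
iov_iter_alignment() ORs every segment's base and length and so returns a
value with 2048 set, which intersects m_blockmask and routes an always-COW
inode to the unaligned path instead of letting two sub-block I/Os overwrite
an already written block.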
static noinline ssize_t
@@ -740,7 +858,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -784,7 +902,7 @@ write_retry:
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -832,6 +950,67 @@ out:
}
STATIC ssize_t
+xfs_file_buffered_write_zoned(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
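
A sketch of the truncation arithmetic above (hypothetical numbers, 4 KiB
blocks):

	/*
	 * Say the greedy reservation yielded ac.reserved_blocks = 2 and
	 * ki_pos = 4608, i.e. 512 bytes into a block:
	 *
	 *   XFS_FSB_TO_B(mp, 2) - (4608 & 4095) = 8192 - 512 = 7680
	 *
	 * The iter is shortened to the 7680 bytes the two reserved blocks
	 * can back from that offset: 3584 bytes to finish the first block
	 * plus 4096 for the second.
	 */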
+
+STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
@@ -878,6 +1057,8 @@ xfs_file_write_iter(
return ret;
}
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@@ -932,7 +1113,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -948,7 +1130,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
- error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1004,7 +1186,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1017,7 +1200,7 @@ xfs_falloc_zero_range(
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len);
+ error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range(
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
STATIC long
-xfs_file_fallocate(
+__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
- if (mode & ~XFS_FALLOC_FL_SUPPORTED)
- return -EOPNOTSUPP;
-
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1124,16 +1303,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
- error = xfs_falloc_collapse_range(file, offset, len);
+ error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
- error = xfs_falloc_zero_range(file, mode, offset, len);
+ error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1154,6 +1333,54 @@ out_unlock:
return error;
}
+static long
+xfs_file_zoned_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int error;
+
+ error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ error = __xfs_file_fallocate(file, mode, offset, len, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
+ return error;
+}
+
+static long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+ * and new zeroes out of place. Get a reservation for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(XFS_I(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)))
+ return xfs_file_zoned_fallocate(file, mode, offset, len);
+ return __xfs_file_fallocate(file, mode, offset, len, NULL);
+}
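
A worked example of why a fixed two-block reservation suffices here
(hypothetical offsets, 4 KiB blocks):

	/*
	 * Punching bytes 1000..8999 leaves the head of block 0 (bytes
	 * 0..999) and the tail of block 2 (bytes 9000..12287) alive.  Each
	 * partially punched block is rewritten out of place together with
	 * its new zeroes, costing one fresh block apiece, and no punch or
	 * zero range touches more than two partial blocks.
	 */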
+
STATIC int
xfs_file_fadvise(
struct file *file,
@@ -1347,15 +1574,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
- if (inode->i_nlink &&
- (file->f_mode & FMODE_WRITE) &&
- !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
- !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@@ -1469,9 +1703,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
-xfs_write_fault(
+__xfs_write_fault(
struct vm_fault *vmf,
- unsigned int order)
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1498,13 +1733,50 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
+ int error;
+ vm_fault_t ret;
+
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ *
+ * But as the overallocation is limited to less than a folio and will be
+ * released instantly, that's just fine.
+ */
+ error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
+ &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip, &ac);
+ return ret;
+}
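
The fault-path reservation is sized from the faulting folio rather than the
write range; a hypothetical illustration (4 KiB blocks):

	/*
	 * A write fault on a 16 KiB folio reserves
	 * XFS_B_TO_FSB(mp, 16384) = 4 blocks.  If a racing truncate
	 * shrinks what actually dirties, the over-reservation is bounded
	 * by the folio size and is returned by xfs_zoned_space_unreserve()
	 * as soon as __xfs_write_fault() returns.
	 */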
+
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
+
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1613,7 +1885,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {