summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/ext4.h24
-rw-r--r--fs/ext4/file.c50
-rw-r--r--fs/ext4/indirect.c123
-rw-r--r--fs/ext4/inode.c166
-rw-r--r--fs/ext4/namei.c10
-rw-r--r--fs/ext4/super.c124
6 files changed, 359 insertions, 138 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..f63c3d5805c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -364,7 +364,8 @@ struct flex_groups {
#define EXT4_DIRTY_FL 0x00000100
#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
-#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
+ /* nb: was previously EXT2_ECOMPR_FL */
+#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
@@ -421,7 +422,7 @@ enum {
EXT4_INODE_DIRTY = 8,
EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
EXT4_INODE_NOCOMPR = 10, /* Don't compress */
- EXT4_INODE_ECOMPR = 11, /* Compression error */
+ EXT4_INODE_ENCRYPT = 11, /* Compression error */
/* End compression flags --- maybe not all used */
EXT4_INODE_INDEX = 12, /* hash-indexed directory */
EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -466,7 +467,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(DIRTY);
CHECK_FLAG_VALUE(COMPRBLK);
CHECK_FLAG_VALUE(NOCOMPR);
- CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(ENCRYPT);
CHECK_FLAG_VALUE(INDEX);
CHECK_FLAG_VALUE(IMAGIC);
CHECK_FLAG_VALUE(JOURNAL_DATA);
@@ -965,6 +966,11 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#else
+#define EXT4_MOUNT_DAX 0
+#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -1043,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID 0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
+
/*
* Structure of the super block
*/
@@ -1156,7 +1168,8 @@ struct ext4_super_block {
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
- __le32 s_reserved[106]; /* Padding to the end of the block */
+ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
+ __le32 s_reserved[105]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1522,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
* GDT_CSUM bits are mutually exclusive.
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1537,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -2578,6 +2593,7 @@ extern const struct file_operations ext4_dir_operations;
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
+extern const struct file_operations ext4_dax_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int o_direct = file->f_flags & O_DIRECT;
+ int o_direct = io_is_direct(file);
int overwrite = 0;
size_t length = iov_iter_count(from);
ssize_t ret;
@@ -191,17 +191,41 @@ errout:
return ret;
}
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_fault(vma, vmf, ext4_get_block);
+ /* Is this the right get_block? */
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+ .fault = ext4_dax_fault,
+ .page_mkwrite = ext4_dax_mkwrite,
+};
+#else
+#define ext4_dax_vm_ops ext4_file_vm_ops
+#endif
+
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops;
+ if (IS_DAX(file_inode(file))) {
+ vma->vm_ops = &ext4_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ } else {
+ vma->vm_ops = &ext4_file_vm_ops;
+ }
return 0;
}
@@ -600,6 +624,26 @@ const struct file_operations ext4_file_operations = {
.fallocate = ext4_fallocate,
};
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext4_dax_file_operations = {
+ .llseek = ext4_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .mmap = ext4_file_mmap,
+ .open = ext4_file_open,
+ .release = ext4_release_file,
+ .fsync = ext4_sync_file,
+ /* Splice not yet supported with DAX */
+ .fallocate = ext4_fallocate,
+};
+#endif
+
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..45fe924f82bc 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
inode_dio_done(inode);
goto locked;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter, offset,
- ext4_get_block, NULL, NULL, 0);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, 0);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iter,
- offset, ext4_get_block);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, DIO_LOCKING);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
@@ -1393,10 +1401,7 @@ end_range:
* to free. Everything was covered by the start
* of the range.
*/
- return 0;
- } else {
- /* Shared branch grows from an indirect block */
- partial2--;
+ goto do_indirects;
}
} else {
/*
@@ -1427,56 +1432,96 @@ end_range:
/* Punch happened within the same level (n == n2) */
partial = ext4_find_shared(inode, n, offsets, chain, &nr);
partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
- /*
- * ext4_find_shared returns Indirect structure which
- * points to the last element which should not be
- * removed by truncate. But this is end of the range
- * in punch_hole so we need to point to the next element
- */
- partial2->p++;
- while ((partial > chain) || (partial2 > chain2)) {
- /* We're at the same block, so we're almost finished */
- if ((partial->bh && partial2->bh) &&
- (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
- if ((partial > chain) && (partial2 > chain2)) {
+
+ /* Free top, but only if partial2 isn't its subtree. */
+ if (nr) {
+ int level = min(partial - chain, partial2 - chain2);
+ int i;
+ int subtree = 1;
+
+ for (i = 0; i <= level; i++) {
+ if (offsets[i] != offsets2[i]) {
+ subtree = 0;
+ break;
+ }
+ }
+
+ if (!subtree) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1,
+ (chain+n-1) - partial);
+ *partial->p = 0;
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
ext4_free_branches(handle, inode, partial->bh,
- partial->p + 1,
- partial2->p,
+ partial->p,
+ partial->p+1,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
}
- return 0;
}
+ }
+
+ if (!nr2) {
/*
- * Clear the ends of indirect blocks on the shared branch
- * at the start of the range
+ * ext4_find_shared returns Indirect structure which
+ * points to the last element which should not be
+ * removed by truncate. But this is end of the range
+ * in punch_hole so we need to point to the next element
*/
- if (partial > chain) {
+ partial2->p++;
+ }
+
+ while (partial > chain || partial2 > chain2) {
+ int depth = (chain+n-1) - partial;
+ int depth2 = (chain2+n2-1) - partial2;
+
+ if (partial > chain && partial2 > chain2 &&
+ partial->bh->b_blocknr == partial2->bh->b_blocknr) {
+ /*
+ * We've converged on the same block. Clear the range,
+ * then we're done.
+ */
ext4_free_branches(handle, inode, partial->bh,
- partial->p + 1,
- (__le32 *)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
+ partial->p + 1,
+ partial2->p,
+ (chain+n-1) - partial);
BUFFER_TRACE(partial->bh, "call brelse");
brelse(partial->bh);
- partial--;
+ BUFFER_TRACE(partial2->bh, "call brelse");
+ brelse(partial2->bh);
+ return 0;
}
+
/*
- * Clear the ends of indirect blocks on the shared branch
- * at the end of the range
+ * The start and end partial branches may not be at the same
+ * level even though the punch happened within one level. So, we
+ * give them a chance to arrive at the same level, then walk
+ * them in step with each other until we converge on the same
+ * block.
*/
- if (partial2 > chain2) {
+ if (partial > chain && depth <= depth2) {
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p + 1,
+ (__le32 *)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+ if (partial2 > chain2 && depth2 <= depth) {
ext4_free_branches(handle, inode, partial2->bh,
(__le32 *)partial2->bh->b_data,
partial2->p,
- (chain2+n-1) - partial2);
+ (chain2+n2-1) - partial2);
BUFFER_TRACE(partial2->bh, "call brelse");
brelse(partial2->bh);
partial2--;
}
}
+ return 0;
do_indirects:
/* Kill the remaining (whole) subtrees */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..5cb9a212b86f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
return retval;
}
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+ struct inode *inode = bh->b_assoc_map->host;
+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+ int err;
+ if (!uptodate)
+ return;
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ bh->b_assoc_map = inode->i_mapping;
+ bh->b_private = (void *)(unsigned long)iblock;
+ bh->b_end_io = ext4_end_io_unwritten;
+ }
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -1007,6 +1024,7 @@ static int ext4_write_end(struct file *file,
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
@@ -1037,6 +1055,8 @@ static int ext4_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
* makes the holding time of page lock longer. Second, it forces lock
@@ -1078,6 +1098,7 @@ static int ext4_journalled_write_end(struct file *file,
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
@@ -1110,6 +1131,9 @@ static int ext4_journalled_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
+
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
@@ -3010,13 +3034,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter,
- offset,
- get_block_func,
- ext4_end_io_dio,
- NULL,
- dio_flags);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+ ext4_end_io_dio, dio_flags);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ get_block_func,
+ ext4_end_io_dio, NULL, dio_flags);
/*
* Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3205,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-static int ext4_block_zero_page_range(handle_t *handle,
+static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, max, pos;
+ unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3205,14 +3223,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- max = blocksize - (offset & (blocksize - 1));
-
- /*
- * correct length if it does not fall between
- * 'from' and the end of the block
- */
- if (length > max || length < 0)
- length = max;
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -3278,6 +3288,33 @@ unlock:
}
/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
+ struct inode *inode = mapping->host;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ unsigned max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
+ if (IS_DAX(inode))
+ return dax_zero_page_range(inode, from, length, ext4_get_block);
+ return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
* up to the end of the block which corresponds to `from'.
* This required during truncate. We need to physically zero the tail end
@@ -3798,8 +3835,10 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
+ if (test_opt(inode->i_sb, DAX))
+ new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4091,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4181,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
return 0;
}
+struct other_inode {
+ unsigned long orig_ino;
+ struct ext4_inode *raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+ void *data)
+{
+ struct other_inode *oi = (struct other_inode *) data;
+
+ if ((inode->i_ino != ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ ((inode->i_state & I_DIRTY_TIME) == 0))
+ return 0;
+ spin_lock(&inode->i_lock);
+ if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ (inode->i_state & I_DIRTY_TIME)) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&ei->i_raw_lock);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+ ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+ trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+ return -1;
+ }
+ spin_unlock(&inode->i_lock);
+ return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+ unsigned long orig_ino, char *buf)
+{
+ struct other_inode oi;
+ unsigned long ino;
+ int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int inode_size = EXT4_INODE_SIZE(sb);
+
+ oi.orig_ino = orig_ino;
+ ino = orig_ino & ~(inodes_per_block - 1);
+ for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+ if (ino == orig_ino)
+ continue;
+ oi.raw_inode = (struct ext4_inode *) buf;
+ (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ }
+}
+
/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
@@ -4248,10 +4349,11 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
-
ext4_inode_csum_set(inode, raw_inode, ei);
-
spin_unlock(&ei->i_raw_lock);
+ if (inode->i_sb->s_flags & MS_LAZYTIME)
+ ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+ bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4636,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
- truncate_pagecache(inode, inode->i_size);
+ truncate_pagecache(inode, inode->i_size);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4942,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything. If
+ * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
+ if (flags == I_DIRTY_TIME)
+ return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 74c5f53595fb..e061e66c8280 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
static int block_device_ejected(struct super_block *sb)
{
struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
return bdi->dev == NULL;
}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
struct path *path);
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
-static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
};
-
-static const struct quotactl_ops ext4_qctl_sysfile_operations = {
- .quota_on_meta = ext4_quota_on_sysfile,
- .quota_off = ext4_quota_off_sysfile,
- .quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
#endif
static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
+ {Opt_dax, "dax"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_removed, "mblk_io_submit"},
{Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_i_version:
sb->s_flags |= MS_I_VERSION;
return 1;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ return 1;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
+#ifndef CONFIG_FS_DAX
+ } else if (token == Opt_dax) {
+ ext4_msg(sb, KERN_INFO, "dax option not supported");
+ return -1;
+#endif
} else {
if (!args->from)
arg = 1;
@@ -2776,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (readonly)
return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+ ext4_msg(sb, KERN_INFO, "filesystem is read-only");
+ sb->s_flags |= MS_RDONLY;
+ return 1;
+ }
+
/* Check that feature set is OK for a read-write mount */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
@@ -3602,6 +3611,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and dioread_nolock");
goto failed_mount;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ goto failed_mount;
+ }
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
}
@@ -3665,6 +3679,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "error: unsupported blocksize for dax");
+ goto failed_mount;
+ }
+ if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ ext4_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ goto failed_mount;
+ }
+ }
+
if (sb->s_blocksize != blocksize) {
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
@@ -3915,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- init_timer(&sbi->s_err_report);
- sbi->s_err_report.function = print_daily_error_info;
- sbi->s_err_report.data = (unsigned long) sb;
+ setup_timer(&sbi->s_err_report, print_daily_error_info,
+ (unsigned long) sb);
/* Register extent status tree shrinker */
if (ext4_es_register_shrinker(sbi))
@@ -3935,7 +3961,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- sb->s_qcop = &ext4_qctl_sysfile_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4845,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
- /*
- * Allow the "check" option to be passed as a remount option.
- */
if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
@@ -4856,17 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
- "during remount not supported");
- err = -EINVAL;
- goto restore_opts;
- }
-
- if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
- test_opt(sb, JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "changing journal_checksum "
- "during remount not supported");
- err = -EINVAL;
- goto restore_opts;
+ "during remount not supported; ignoring");
+ sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
}
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -4882,6 +4896,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+ ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+ "dax flag with busy inodes while remounting");
+ sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
}
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -4930,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
} else {
/* Make sure we can mount this feature set readwrite */
- if (!ext4_feature_set_ok(sb, 0)) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_READONLY) ||
+ !ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
@@ -5020,6 +5048,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
#endif
+ *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
@@ -5288,21 +5317,6 @@ static int ext4_enable_quotas(struct super_block *sb)
return 0;
}
-/*
- * quota_on function that is used when QUOTA feature is set.
- */
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /*
- * USAGE was enabled at mount time. Only need to enable LIMITS now.
- */
- return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
-}
-
static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5343,6 @@ out:
return dquot_quota_off(sb, type);
}
-/*
- * quota_off function that is used when QUOTA feature is set.
- */
-static int ext4_quota_off_sysfile(struct super_block *sb, int type)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /* Disable only the limits. */
- return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)