summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Brandenburg <martin@omnibond.com>2019-02-12 20:19:06 +0000
committerMike Marshall <hubcap@omnibond.com>2019-05-03 14:32:39 -0400
commit8f04e1be784858ba0288c7c09b9de06627a800c9 (patch)
treebf777bf20b50e8904de6be0f63f452b4246a9909
parentc472ebc25555e634d89e1ed508d37c9102bff017 (diff)
downloadlwn-8f04e1be784858ba0288c7c09b9de06627a800c9.tar.gz
lwn-8f04e1be784858ba0288c7c09b9de06627a800c9.zip
orangefs: add orangefs_revalidate_mapping
This is modeled after NFS, except our method is different. We use a simple timer to determine whether to invalidate the page cache. This is bound to perform. This addes a sysfs parameter cache_timeout_msecs which controls the time between page cache invalidations. Signed-off-by: Martin Brandenburg <martin@omnibond.com> Signed-off-by: Mike Marshall <hubcap@omnibond.com>
-rw-r--r--fs/orangefs/file.c70
-rw-r--r--fs/orangefs/inode.c250
-rw-r--r--fs/orangefs/orangefs-kernel.h4
-rw-r--r--fs/orangefs/orangefs-mod.c1
-rw-r--r--fs/orangefs/orangefs-sysfs.c22
5 files changed, 328 insertions, 19 deletions
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 405449ce4b02..faa5b61cdfd6 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -241,18 +241,78 @@ out:
return ret;
}
+int orangefs_revalidate_mapping(struct inode *inode)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long *bitlock = &orangefs_inode->bitlock;
+ int ret;
+
+ while (1) {
+ ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
+ if (ret)
+ return ret;
+ spin_lock(&inode->i_lock);
+ if (test_bit(1, bitlock)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (!time_before(jiffies, orangefs_inode->mapping_time))
+ break;
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
+
+ set_bit(1, bitlock);
+ smp_wmb();
+ spin_unlock(&inode->i_lock);
+
+ unmap_mapping_range(mapping, 0, 0, 0);
+ ret = filemap_write_and_wait(mapping);
+ if (!ret)
+ ret = invalidate_inode_pages2(mapping);
+
+ orangefs_inode->mapping_time = jiffies +
+ orangefs_cache_timeout_msecs*HZ/1000;
+
+ clear_bit(1, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, 1);
+
+ return ret;
+}
+
static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
+ int ret;
orangefs_stats.reads++;
- return generic_file_read_iter(iocb, iter);
+
+ down_read(&file_inode(iocb->ki_filp)->i_rwsem);
+ ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+ if (ret)
+ goto out;
+
+ ret = generic_file_read_iter(iocb, iter);
+out:
+ up_read(&file_inode(iocb->ki_filp)->i_rwsem);
+ return ret;
}
static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
+ int ret;
orangefs_stats.writes++;
- return generic_file_write_iter(iocb, iter);
+
+ if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
+ ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+ if (ret)
+ return ret;
+ }
+
+ ret = generic_file_write_iter(iocb, iter);
+ return ret;
}
/*
@@ -341,6 +401,12 @@ static const struct vm_operations_struct orangefs_file_vm_ops = {
*/
static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ int ret;
+
+ ret = orangefs_revalidate_mapping(file_inode(file));
+ if (ret)
+ return ret;
+
gossip_debug(GOSSIP_FILE_DEBUG,
"orangefs_file_mmap: called on %s\n",
(file ?
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index add9c569a7dc..7ed2ea093c4e 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -31,6 +31,7 @@ static int orangefs_writepage_locked(struct page *page,
len = i_size_read(inode);
if (PagePrivate(page)) {
wr = (struct orangefs_write_range *)page_private(page);
+ WARN_ON(wr->pos >= len);
off = wr->pos;
if (off + wr->len > len)
wlen = len - off;
@@ -79,6 +80,173 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
return ret;
}
+struct orangefs_writepages {
+ loff_t off;
+ size_t len;
+ kuid_t uid;
+ kgid_t gid;
+ int maxpages;
+ int npages;
+ struct page **pages;
+ struct bio_vec *bv;
+};
+
+static int orangefs_writepages_work(struct orangefs_writepages *ow,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = ow->pages[0]->mapping->host;
+ struct orangefs_write_range *wrp, wr;
+ struct iov_iter iter;
+ ssize_t ret;
+ size_t len;
+ loff_t off;
+ int i;
+
+ len = i_size_read(inode);
+
+ for (i = 0; i < ow->npages; i++) {
+ set_page_writeback(ow->pages[i]);
+ ow->bv[i].bv_page = ow->pages[i];
+ ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE,
+ ow->off + ow->len) -
+ max(ow->off, page_offset(ow->pages[i]));
+ if (i == 0)
+ ow->bv[i].bv_offset = ow->off -
+ page_offset(ow->pages[i]);
+ else
+ ow->bv[i].bv_offset = 0;
+ }
+ iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+
+ WARN_ON(ow->off >= len);
+ if (ow->off + ow->len > len)
+ ow->len = len - ow->off;
+
+ off = ow->off;
+ wr.uid = ow->uid;
+ wr.gid = ow->gid;
+ ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
+ 0, &wr);
+ if (ret < 0) {
+ for (i = 0; i < ow->npages; i++) {
+ SetPageError(ow->pages[i]);
+ mapping_set_error(ow->pages[i]->mapping, ret);
+ if (PagePrivate(ow->pages[i])) {
+ wrp = (struct orangefs_write_range *)
+ page_private(ow->pages[i]);
+ ClearPagePrivate(ow->pages[i]);
+ put_page(ow->pages[i]);
+ kfree(wrp);
+ }
+ end_page_writeback(ow->pages[i]);
+ unlock_page(ow->pages[i]);
+ }
+ } else {
+ ret = 0;
+ for (i = 0; i < ow->npages; i++) {
+ if (PagePrivate(ow->pages[i])) {
+ wrp = (struct orangefs_write_range *)
+ page_private(ow->pages[i]);
+ ClearPagePrivate(ow->pages[i]);
+ put_page(ow->pages[i]);
+ kfree(wrp);
+ }
+ end_page_writeback(ow->pages[i]);
+ unlock_page(ow->pages[i]);
+ }
+ }
+ return ret;
+}
+
+static int orangefs_writepages_callback(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct orangefs_writepages *ow = data;
+ struct orangefs_write_range *wr;
+ int ret;
+
+ if (!PagePrivate(page)) {
+ unlock_page(page);
+ /* It's not private so there's nothing to write, right? */
+ printk("writepages_callback not private!\n");
+ BUG();
+ return 0;
+ }
+ wr = (struct orangefs_write_range *)page_private(page);
+
+ ret = -1;
+ if (ow->npages == 0) {
+ ow->off = wr->pos;
+ ow->len = wr->len;
+ ow->uid = wr->uid;
+ ow->gid = wr->gid;
+ ow->pages[ow->npages++] = page;
+ ret = 0;
+ goto done;
+ }
+ if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
+ orangefs_writepages_work(ow, wbc);
+ ow->npages = 0;
+ ret = -1;
+ goto done;
+ }
+ if (ow->off + ow->len == wr->pos) {
+ ow->len += wr->len;
+ ow->pages[ow->npages++] = page;
+ ret = 0;
+ goto done;
+ }
+done:
+ if (ret == -1) {
+ if (ow->npages) {
+ orangefs_writepages_work(ow, wbc);
+ ow->npages = 0;
+ }
+ ret = orangefs_writepage_locked(page, wbc);
+ mapping_set_error(page->mapping, ret);
+ unlock_page(page);
+ end_page_writeback(page);
+ } else {
+ if (ow->npages == ow->maxpages) {
+ orangefs_writepages_work(ow, wbc);
+ ow->npages = 0;
+ }
+ }
+ return ret;
+}
+
+static int orangefs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct orangefs_writepages *ow;
+ struct blk_plug plug;
+ int ret;
+ ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
+ if (!ow)
+ return -ENOMEM;
+ ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
+ ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
+ if (!ow->pages) {
+ kfree(ow);
+ return -ENOMEM;
+ }
+ ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
+ if (!ow->bv) {
+ kfree(ow->pages);
+ kfree(ow);
+ return -ENOMEM;
+ }
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
+ if (ow->npages)
+ ret = orangefs_writepages_work(ow, wbc);
+ blk_finish_plug(&plug);
+ kfree(ow->pages);
+ kfree(ow->bv);
+ kfree(ow);
+ return ret;
+}
+
static int orangefs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
@@ -93,6 +261,9 @@ static int orangefs_readpage(struct file *file, struct page *page)
bv.bv_offset = 0;
iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
+ if (PageDirty(page))
+ orangefs_launder_page(page);
+
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
PAGE_SIZE, inode->i_size, NULL);
/* this will only zero remaining unread portions of the page data */
@@ -170,22 +341,42 @@ static int orangefs_write_begin(struct file *file,
set_page_private(page, (unsigned long)wr);
get_page(page);
okay:
-
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- unsigned from = pos & (PAGE_SIZE - 1);
-
- zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
- }
return 0;
}
static int orangefs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
{
- int r;
- r = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
+ struct inode *inode = page->mapping->host;
+ loff_t last_pos = pos + copied;
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold the i_mutex.
+ */
+ if (last_pos > inode->i_size)
+ i_size_write(inode, last_pos);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (!PageUptodate(page)) {
+ unsigned from = pos & (PAGE_SIZE - 1);
+ if (copied < len) {
+ zero_user(page, from + copied, len - copied);
+ }
+ /* Set fully written pages uptodate. */
+ if (pos == page_offset(page) &&
+ (len == PAGE_SIZE || pos + len == inode->i_size)) {
+ zero_user_segment(page, from + copied, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+ }
+
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+
mark_inode_dirty_sync(file_inode(file));
- return r;
+ return copied;
}
static void orangefs_invalidatepage(struct page *page,
@@ -200,6 +391,7 @@ static void orangefs_invalidatepage(struct page *page,
set_page_private(page, 0);
ClearPagePrivate(page);
put_page(page);
+ return;
/* write range entirely within invalidate range (or equal) */
} else if (page_offset(page) + offset <= wr->pos &&
wr->pos + wr->len <= page_offset(page) + offset + length) {
@@ -209,6 +401,7 @@ static void orangefs_invalidatepage(struct page *page,
put_page(page);
/* XXX is this right? only caller in fs */
cancel_dirty_page(page);
+ return;
/* invalidate range chops off end of write range */
} else if (wr->pos < page_offset(page) + offset &&
wr->pos + wr->len <= page_offset(page) + offset + length &&
@@ -240,6 +433,7 @@ static void orangefs_invalidatepage(struct page *page,
* should we just ignore this and write it out anyway?
* it hardly makes sense
*/
+ return;
/* non-overlapping ranges */
} else {
/* WARN if they do overlap */
@@ -251,7 +445,15 @@ static void orangefs_invalidatepage(struct page *page,
printk("write range offset %llu length %zu\n",
wr->pos, wr->len);
}
+ return;
}
+
+ /*
+ * Above there are returns where wr is freed or where we WARN.
+ * Thus the following runs if wr was modified above.
+ */
+
+ orangefs_launder_page(page);
}
static int orangefs_releasepage(struct page *page, gfp_t foo)
@@ -404,6 +606,7 @@ out:
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readpage = orangefs_readpage,
+ .writepages = orangefs_writepages,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
@@ -418,9 +621,18 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret = VM_FAULT_LOCKED;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ unsigned long *bitlock = &orangefs_inode->bitlock;
+ vm_fault_t ret;
struct orangefs_write_range *wr;
+ sb_start_pagefault(inode->i_sb);
+
+ if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
lock_page(page);
if (PageDirty(page) && !PagePrivate(page)) {
/*
@@ -429,7 +641,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
* orangefs_writepage_locked.
*/
if (orangefs_launder_page(page)) {
- ret = VM_FAULT_RETRY;
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
}
@@ -442,14 +654,14 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
goto okay;
} else {
if (orangefs_launder_page(page)) {
- ret = VM_FAULT_RETRY;
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
}
}
wr = kmalloc(sizeof *wr, GFP_KERNEL);
if (!wr) {
- ret = VM_FAULT_RETRY;
+ ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
wr->pos = page_offset(page);
@@ -461,11 +673,10 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
get_page(page);
okay:
- sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- ret = VM_FAULT_NOPAGE;
+ ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
goto out;
}
@@ -476,6 +687,7 @@ okay:
*/
set_page_dirty(page);
wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -553,13 +765,15 @@ int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
} else {
gossip_debug(GOSSIP_UTILS_DEBUG,
"User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
}
if (iattr->ia_mode & (S_ISUID)) {
gossip_debug(GOSSIP_UTILS_DEBUG,
"Attempting to set setuid bit (not supported); returning EINVAL.\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
}
@@ -741,6 +955,8 @@ static int orangefs_set_inode(struct inode *inode, void *data)
ORANGEFS_I(inode)->refn.khandle = ref->khandle;
ORANGEFS_I(inode)->attr_valid = 0;
hash_init(ORANGEFS_I(inode)->xattr_cache);
+ ORANGEFS_I(inode)->mapping_time = jiffies - 1;
+ ORANGEFS_I(inode)->bitlock = 0;
return 0;
}
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 336a3ec0b83e..87beab10326a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -193,9 +193,11 @@ struct orangefs_inode_s {
sector_t last_failed_block_index_read;
unsigned long getattr_time;
+ unsigned long mapping_time;
int attr_valid;
kuid_t attr_uid;
kgid_t attr_gid;
+ unsigned long bitlock;
DECLARE_HASHTABLE(xattr_cache, 4);
};
@@ -390,6 +392,7 @@ bool __is_daemon_in_service(void);
/*
* defined in file.c
*/
+int orangefs_revalidate_mapping(struct inode *);
ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
struct iov_iter *, size_t, loff_t, struct orangefs_write_range *);
ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
@@ -427,6 +430,7 @@ int orangefs_normalize_to_errno(__s32 error_code);
extern struct mutex orangefs_request_mutex;
extern int op_timeout_secs;
extern int slot_timeout_secs;
+extern int orangefs_cache_timeout_msecs;
extern int orangefs_dcache_timeout_msecs;
extern int orangefs_getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 85ef87245a87..82cf8b3e568b 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -30,6 +30,7 @@ static ulong module_parm_debug_mask;
__u64 orangefs_gossip_debug_mask;
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int orangefs_cache_timeout_msecs = 50;
int orangefs_dcache_timeout_msecs = 50;
int orangefs_getattr_timeout_msecs = 50;
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 19739aaee675..3627ea946402 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -62,6 +62,14 @@
* Slots are requested and waited for,
* the wait times out after slot_timeout_secs.
*
+ * What: /sys/fs/orangefs/cache_timeout_msecs
+ * Date: Mar 2018
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time in milliseconds between which
+ * orangefs_revalidate_mapping will invalidate the page
+ * cache.
+ *
* What: /sys/fs/orangefs/dcache_timeout_msecs
* Date: Jul 2016
* Contact: Martin Brandenburg <martin@omnibond.com>
@@ -222,6 +230,13 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
slot_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name,
+ "cache_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ orangefs_cache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(attr->attr.name,
"dcache_timeout_msecs")) {
rc = scnprintf(buf,
PAGE_SIZE,
@@ -277,6 +292,9 @@ static ssize_t sysfs_int_store(struct kobject *kobj,
} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
rc = kstrtoint(buf, 0, &slot_timeout_secs);
goto out;
+ } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs);
+ goto out;
} else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
goto out;
@@ -818,6 +836,9 @@ static struct orangefs_attribute op_timeout_secs_attribute =
static struct orangefs_attribute slot_timeout_secs_attribute =
__ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
+static struct orangefs_attribute cache_timeout_msecs_attribute =
+ __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
static struct orangefs_attribute dcache_timeout_msecs_attribute =
__ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
@@ -861,6 +882,7 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
static struct attribute *orangefs_default_attrs[] = {
&op_timeout_secs_attribute.attr,
&slot_timeout_secs_attribute.attr,
+ &cache_timeout_msecs_attribute.attr,
&dcache_timeout_msecs_attribute.attr,
&getattr_timeout_msecs_attribute.attr,
&readahead_count_attribute.attr,