From 7c70b896951c84d63e6d71b82668f9c8b8bbd440 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 25 Mar 2019 09:34:19 -0600 Subject: gfs2: clean_journal improperly set sd_log_flush_head This patch fixes regressions in 588bff95c94efc05f9e1a0b19015c9408ed7c0ef. Due to that patch, function clean_journal was setting the value of sd_log_flush_head, but that's only valid if it is replaying the node's own journal. If it's replaying another node's journal, that's completely wrong and will lead to multiple problems. This patch tries to clean up the mess by passing the value of the logical journal block number into gfs2_write_log_header so the function can treat non-owned journals generically. For the local journal, the journal extent map is used for best performance. For other nodes from other journals, new function gfs2_lblk_to_dblk is called to figure it out using gfs2_iomap_get. This patch also tries to establish more consistency when passing journal block parameters by changing several unsigned int types to a consistent u32. Fixes: 588bff95c94e ("GFS2: Reduce code redundancy writing log headers") Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/lops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8722c60b11fe..aef21b6a608f 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -530,7 +530,7 @@ static void buf_lo_before_scan(struct gfs2_jdesc *jd, jd->jd_replayed_blocks = 0; } -static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass) { @@ -685,7 +685,7 @@ static void revoke_lo_before_scan(struct gfs2_jdesc *jd, jd->jd_replay_tail = head->lh_tail; } -static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass) { @@ -767,7 +767,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); } -static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass) { -- cgit v1.2.3 From 9287c6452d2b1f24ea8e84bd3cf6f3c6f267f712 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 4 Apr 2019 21:11:11 +0100 Subject: gfs2: Fix occasional glock use-after-free This patch has to do with the life cycle of glocks and buffers. When gfs2 metadata or journaled data is queued to be written, a gfs2_bufdata object is assigned to track the buffer, and that is queued to various lists, including the glock's gl_ail_list to indicate it's on the active items list. Once the page associated with the buffer has been written, it is removed from the ail list, but its life isn't over until a revoke has been successfully written. So after the block is written, its bufdata object is moved from the glock's gl_ail_list to a file-system-wide list of pending revokes, sd_log_le_revoke. At that point the glock still needs to track how many revokes it contributed to that list (in gl_revokes) so that things like glock go_sync can ensure all the metadata has been not only written, but also revoked before the glock is granted to a different node. This is to guarantee journal replay doesn't replay the block once the glock has been granted to another node. Ross Lagerwall recently discovered a race in which an inode could be evicted, and its glock freed after its ail list had been synced, but while it still had unwritten revokes on the sd_log_le_revoke list. The evict decremented the glock reference count to zero, which allowed the glock to be freed. After the revoke was written, function revoke_lo_after_commit tried to adjust the glock's gl_revokes counter and clear its GLF_LFLUSH flag, at which time it referenced the freed glock. This patch fixes the problem by incrementing the glock reference count in gfs2_add_revoke when the glock's first bufdata object is moved from the glock to the global revokes list. Later, when the glock's last such bufdata object is freed, the reference count is decremented. This guarantees that whichever process finishes last (the revoke writing or the evict) will properly free the glock, and neither will reference the glock after it has been freed. Reported-by: Ross Lagerwall Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 1 + fs/gfs2/log.c | 3 ++- fs/gfs2/lops.c | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e4f6d39500bc..71c28ff98b56 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -140,6 +140,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + BUG_ON(atomic_read(&gl->gl_revokes)); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ebbc68dca145..7ba94b66c91b 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -606,7 +606,8 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index aef21b6a608f..b4f0a6a3ba59 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -669,8 +669,10 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - atomic_dec(&gl->gl_revokes); - clear_bit(GLF_LFLUSH, &gl->gl_flags); + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } kmem_cache_free(gfs2_bufdata_cachep, bd); } } -- cgit v1.2.3 From 73118ca8baf78dddd1f9c8ac67c1d80b47d9830e Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 5 Apr 2019 04:41:38 +0100 Subject: gfs2: Replace gl_revokes with a GLF flag The gl_revokes value determines how many outstanding revokes a glock has on the superblock revokes list; this is used to avoid unnecessary log flushes. However, gl_revokes is only ever tested for being zero, and it's only decremented in revoke_lo_after_commit, which removes all revokes from the list, so we know that the gl_revoke values of all the glocks on the list will reach zero. Therefore, we can replace gl_revokes with a bit flag. This saves an atomic counter in struct gfs2_glock. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 4 ++-- fs/gfs2/incore.h | 2 +- fs/gfs2/log.c | 4 +++- fs/gfs2/lops.c | 33 ++++++++++++++++++++++++--------- fs/gfs2/main.c | 1 - fs/gfs2/super.c | 2 +- 6 files changed, 31 insertions(+), 15 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 71c28ff98b56..15c605cfcfc8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -140,7 +140,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - BUG_ON(atomic_read(&gl->gl_revokes)); + BUG_ON(test_bit(GLF_REVOKES, &gl->gl_flags)); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); @@ -1801,7 +1801,7 @@ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl) state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), - atomic_read(&gl->gl_revokes), + test_bit(GLF_REVOKES, &gl->gl_flags) ? 1 : 0, (int)gl->gl_lockref.count, gl->gl_hold_time); list_for_each_entry(gh, &gl->gl_holders, gh_list) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 86840a70ee1a..6a94b094a904 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -345,6 +345,7 @@ enum { GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, GLF_INODE_CREATING = 16, /* Inode creation occurring */ + GLF_REVOKES = 17, /* Glock has revokes in queue */ }; struct gfs2_glock { @@ -374,7 +375,6 @@ struct gfs2_glock { struct list_head gl_lru; struct list_head gl_ail_list; atomic_t gl_ail_count; - atomic_t gl_revokes; struct delayed_work gl_work; union { /* For inode and iopen glocks only */ diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 7ba94b66c91b..d55315a46ece 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -606,8 +606,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; sdp->sd_log_num_revoke++; - if (atomic_inc_return(&gl->gl_revokes) == 1) + if (!test_bit(GLF_REVOKES, &gl->gl_flags)) { + set_bit(GLF_REVOKES, &gl->gl_flags); gfs2_glock_hold(gl); + } set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index b4f0a6a3ba59..2fd61853ba63 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -662,19 +662,34 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_revoke; - struct gfs2_bufdata *bd; - struct gfs2_glock *gl; + struct gfs2_bufdata *bd, *tmp; - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list); - list_del_init(&bd->bd_list); - gl = bd->bd_gl; - if (atomic_dec_return(&gl->gl_revokes) == 0) { - clear_bit(GLF_LFLUSH, &gl->gl_flags); - gfs2_glock_queue_put(gl); + /* + * Glocks can be referenced repeatedly on the revoke list, but the list + * only holds one reference. All glocks on the list will have the + * GLF_REVOKES flag set initially. + */ + + list_for_each_entry_safe(bd, tmp, head, bd_list) { + struct gfs2_glock *gl = bd->bd_gl; + + if (test_bit(GLF_REVOKES, &gl->gl_flags)) { + /* Keep each glock on the list exactly once. */ + clear_bit(GLF_REVOKES, &gl->gl_flags); + continue; } + list_del(&bd->bd_list); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + list_for_each_entry_safe(bd, tmp, head, bd_list) { + struct gfs2_glock *gl = bd->bd_gl; + + list_del(&bd->bd_list); kmem_cache_free(gfs2_bufdata_cachep, bd); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); } + /* the list is empty now */ } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 136484ef35d3..c700738de1f7 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -59,7 +59,6 @@ static void gfs2_init_glock_once(void *foo) INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); - atomic_set(&gl->gl_revokes, 0); } static void gfs2_init_gl_aspace_once(void *foo) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7b8d2306b3d3..2a7692d70394 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1474,7 +1474,7 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); truncate_inode_pages(&inode->i_data, 0); - if (atomic_read(&gl->gl_revokes) == 0) { + if (!test_bit(GLF_REVOKES, &gl->gl_flags)) { clear_bit(GLF_LFLUSH, &gl->gl_flags); clear_bit(GLF_DIRTY, &gl->gl_flags); } -- cgit v1.2.3 From 32ac43f6a4ebf398c82e4b541fbe1229b2e47384 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 4 Apr 2019 15:32:41 +0100 Subject: gfs2: Remove unnecessary extern declarations Make log operations statuc; they are only used locally. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/lops.c | 6 +++--- fs/gfs2/lops.h | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 2fd61853ba63..6c1ec6c60639 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -870,7 +870,7 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) } -const struct gfs2_log_operations gfs2_buf_lops = { +static const struct gfs2_log_operations gfs2_buf_lops = { .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -879,7 +879,7 @@ const struct gfs2_log_operations gfs2_buf_lops = { .lo_name = "buf", }; -const struct gfs2_log_operations gfs2_revoke_lops = { +static const struct gfs2_log_operations gfs2_revoke_lops = { .lo_before_commit = revoke_lo_before_commit, .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, @@ -888,7 +888,7 @@ const struct gfs2_log_operations gfs2_revoke_lops = { .lo_name = "revoke", }; -const struct gfs2_log_operations gfs2_databuf_lops = { +static const struct gfs2_log_operations gfs2_databuf_lops = { .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 4e81742de7a0..320fbf28d2fb 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -20,11 +20,6 @@ ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ ~(2 * sizeof(__be64) - 1)) -extern const struct gfs2_log_operations gfs2_glock_lops; -extern const struct gfs2_log_operations gfs2_buf_lops; -extern const struct gfs2_log_operations gfs2_revoke_lops; -extern const struct gfs2_log_operations gfs2_databuf_lops; - extern const struct gfs2_log_operations *gfs2_log_ops[]; extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, -- cgit v1.2.3 From a5b1d3fc503164bb04e2b720054ab07d8a0004fc Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 5 Apr 2019 12:16:14 +0100 Subject: gfs2: Rename sd_log_le_{revoke,ordered} Rename sd_log_le_revoke to sd_log_revokes and sd_log_le_ordered to sd_log_ordered: not sure what le stands for here, but it doesn't add clarity, and if it stands for list entry, it's actually confusing as those are both list heads but not list entries. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 4 ++-- fs/gfs2/log.c | 14 +++++++------- fs/gfs2/log.h | 2 +- fs/gfs2/lops.c | 4 ++-- fs/gfs2/ops_fstype.c | 4 ++-- fs/gfs2/trans.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6a94b094a904..78c8e761b321 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -809,8 +809,8 @@ struct gfs2_sbd { atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; - struct list_head sd_log_le_revoke; - struct list_head sd_log_le_ordered; + struct list_head sd_log_revokes; + struct list_head sd_log_ordered; spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index d55315a46ece..a7febb4bd400 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -551,9 +551,9 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) LIST_HEAD(written); spin_lock(&sdp->sd_ordered_lock); - list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); - while (!list_empty(&sdp->sd_log_le_ordered)) { - ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp); + while (!list_empty(&sdp->sd_log_ordered)) { + ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered); if (ip->i_inode.i_mapping->nrpages == 0) { test_and_clear_bit(GIF_ORDERED, &ip->i_flags); list_del(&ip->i_ordered); @@ -564,7 +564,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) filemap_fdatawrite(ip->i_inode.i_mapping); spin_lock(&sdp->sd_ordered_lock); } - list_splice(&written, &sdp->sd_log_le_ordered); + list_splice(&written, &sdp->sd_log_ordered); spin_unlock(&sdp->sd_ordered_lock); } @@ -573,8 +573,8 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) struct gfs2_inode *ip; spin_lock(&sdp->sd_ordered_lock); - while (!list_empty(&sdp->sd_log_le_ordered)) { - ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + while (!list_empty(&sdp->sd_log_ordered)) { + ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered); list_del(&ip->i_ordered); WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); if (ip->i_inode.i_mapping->nrpages == 0) @@ -611,7 +611,7 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) gfs2_glock_hold(gl); } set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); + list_add(&bd->bd_list, &sdp->sd_log_revokes); } void gfs2_write_revokes(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 86d07d436cdf..7a34a3234266 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -59,7 +59,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) if (!test_bit(GIF_ORDERED, &ip->i_flags)) { spin_lock(&sdp->sd_ordered_lock); if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) - list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + list_add(&ip->i_ordered, &sdp->sd_log_ordered); spin_unlock(&sdp->sd_ordered_lock); } } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6c1ec6c60639..6af6a3cea967 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -623,7 +623,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_meta_header *mh; unsigned int offset; - struct list_head *head = &sdp->sd_log_le_revoke; + struct list_head *head = &sdp->sd_log_revokes; struct gfs2_bufdata *bd; struct page *page; unsigned int length; @@ -661,7 +661,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_revoke; + struct list_head *head = &sdp->sd_log_revokes; struct gfs2_bufdata *bd, *tmp; /* diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b041cb8ae383..abfaecde0e3d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,8 +117,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); - INIT_LIST_HEAD(&sdp->sd_log_le_revoke); - INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + INIT_LIST_HEAD(&sdp->sd_log_revokes); + INIT_LIST_HEAD(&sdp->sd_log_ordered); spin_lock_init(&sdp->sd_ordered_lock); init_waitqueue_head(&sdp->sd_log_waitq); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index cd9a94a6b5bb..5c4eae3b69fb 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -260,7 +260,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) unsigned int n = len; gfs2_log_lock(sdp); - list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) { + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_revokes, bd_list) { if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); -- cgit v1.2.3 From f4686c26ecc34e8e458b8235f0af5198c9b13bfd Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Thu, 2 May 2019 14:17:40 -0500 Subject: gfs2: read journal in large chunks Use bios to read in the journal into the address space of the journal inode (jd_inode), sequentially and in large chunks. This is faster for locating the journal head that the previous binary search approach. When performing recovery, we keep the journal in the address space until recovery is done, which further speeds up things. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 3 +- fs/gfs2/log.c | 4 +- fs/gfs2/lops.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 3 +- fs/gfs2/recovery.c | 125 +----------------------------- fs/gfs2/recovery.h | 2 - fs/gfs2/super.c | 5 +- 8 files changed, 219 insertions(+), 139 deletions(-) (limited to 'fs/gfs2/lops.c') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78510ab91835..24ada3ccc525 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; @@ -531,7 +532,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - error = gfs2_find_jhead(sdp->sd_jdesc, &head); + error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); if (error) gfs2_consist(sdp); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a7febb4bd400..a2e1df488df0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -744,7 +744,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); log_flush_wait(sdp); } @@ -821,7 +821,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6af6a3cea967..ce048a9e058d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,7 +17,9 @@ #include #include #include +#include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -194,7 +196,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -232,20 +233,19 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @op: REQ_OP - * @op_flags: req_flag_bits + * @opf: REQ_OP | op_flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int opf) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = opf; submit_bio(bio); *biop = NULL; } @@ -306,7 +306,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op, 0); + gfs2_log_submit_bio(biop, op); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -377,6 +377,206 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + int i; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, i, iter_all) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + } + unlock_page(page); + } + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * @jd: The journal descriptor + * @page: The page to look in + * + * Returns: 1 if found, 0 otherwise. + */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host uninitialized_var(lh); + void *kaddr = kmap_atomic(page); + unsigned int offset; + bool ret = false; + + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { + if (lh.lh_sequence > head->lh_sequence) + *head = lh; + else { + ret = true; + break; + } + } + } + kunmap_atomic(kaddr); + return ret; +} + +/** + * gfs2_jhead_process_page - Search/cleanup a page + * @jd: The journal descriptor + * @index: Index of the page to look into + * @done: If set, perform only cleanup, else search and set if found. + * + * Find the page with 'index' in the journal's mapping. Search the page for + * the journal head if requested (cleanup == false). Release refs on the + * page so the page cache can reclaim it (put_page() twice). We grabbed a + * reference on this page two times, first when we did a find_or_create_page() + * to obtain the page to add it to the bio and second when we do a + * find_get_page() here to get the page to wait on while I/O on it is being + * completed. + * This function is also used to free up a page we might've grabbed but not + * used. Maybe we added it to a bio, but not submitted it for I/O. Or we + * submitted the I/O, but we already found the jhead so we only need to drop + * our references to the page. + */ + +static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, + struct gfs2_log_header_host *head, + bool *done) +{ + struct page *page; + + page = find_get_page(jd->jd_inode->i_mapping, index); + wait_on_page_locked(page); + + if (PageError(page)) + *done = true; + + if (!*done) + *done = gfs2_jhead_pg_srch(jd, head, page); + + put_page(page); /* Once for find_get_page */ + put_page(page); /* Once more for find_or_create_page */ +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: The journal descriptor + * @head: The log descriptor for the head of the log is returned here + * + * Do a search of a journal by reading it in large chunks using bios and find + * the valid log entry with the highest sequence number. (i.e. the log head) + * + * Returns: 0 on success, errno otherwise + */ +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, + bool keep_cache) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct address_space *mapping = jd->jd_inode->i_mapping; + unsigned int block = 0, blocks_submitted = 0, blocks_read = 0; + unsigned int bsize = sdp->sd_sb.sb_bsize; + unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; + unsigned int shift = PAGE_SHIFT - bsize_shift; + unsigned int readhead_blocks = BIO_MAX_PAGES << shift; + struct gfs2_journal_extent *je; + int sz, ret = 0; + struct bio *bio = NULL; + struct page *page = NULL; + bool done = false; + errseq_t since; + + memset(head, 0, sizeof(*head)); + if (list_empty(&jd->extent_list)) + gfs2_map_journal_extents(sdp, jd); + + since = filemap_sample_wb_err(mapping); + list_for_each_entry(je, &jd->extent_list, list) { + for (; block < je->lblock + je->blocks; block++) { + u64 dblock; + + if (!page) { + page = find_or_create_page(mapping, + block >> shift, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + done = true; + goto out; + } + } + + if (bio) { + unsigned int off; + + off = (block << bsize_shift) & ~PAGE_MASK; + sz = bio_add_page(bio, page, bsize, off); + if (sz == bsize) { /* block added */ + if (off + bsize == PAGE_SIZE) { + page = NULL; + goto page_added; + } + continue; + } + blocks_submitted = block + 1; + submit_bio(bio); + bio = NULL; + } + + dblock = je->dblock + (block - je->lblock); + bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); + bio->bi_opf = REQ_OP_READ; + sz = bio_add_page(bio, page, bsize, 0); + gfs2_assert_warn(sdp, sz == bsize); + if (bsize == PAGE_SIZE) + page = NULL; + +page_added: + if (blocks_submitted < blocks_read + readhead_blocks) { + /* Keep at least one bio in flight */ + continue; + } + + gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done); + blocks_read += PAGE_SIZE >> bsize_shift; + if (done) + goto out; /* found */ + } + } + +out: + if (bio) + submit_bio(bio); + while (blocks_read < block) { + gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done); + blocks_read += PAGE_SIZE >> bsize_shift; + } + + if (!ret) + ret = filemap_check_wb_err(mapping, since); + + if (!keep_cache) + truncate_inode_pages(mapping, 0); + + return ret; +} + static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, u32 ld_length, u32 ld_data1) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 320fbf28d2fb..f195ffb435ac 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -25,8 +25,10 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags); +extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, bool keep_cache); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index abfaecde0e3d..46f6615eaf12 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -41,6 +41,7 @@ #include "dir.h" #include "meta_io.h" #include "trace_gfs2.h" +#include "lops.h" #define DO 0 #define UNDO 1 @@ -616,7 +617,7 @@ static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) fs_err(sdp, "Error checking journal for spectator mount.\n"); goto out_unlock; } - error = gfs2_find_jhead(jd, &head); + error = gfs2_find_jhead(jd, &head, false); if (error) { fs_err(sdp, "Error parsing journal for spectator mount.\n"); goto out_unlock; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index fa575d1676b9..389b3ef77e20 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -181,129 +181,6 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, return error; } -/** - * find_good_lh - find a good log header - * @jd: the journal - * @blk: the segment to start searching from - * @lh: the log header to fill in - * @forward: if true search forward in the log, else search backward - * - * Call get_log_header() to get a log header for a segment, but if the - * segment is bad, either scan forward or backward until we find a good one. - * - * Returns: errno - */ - -static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, - struct gfs2_log_header_host *head) -{ - unsigned int orig_blk = *blk; - int error; - - for (;;) { - error = get_log_header(jd, *blk, head); - if (error <= 0) - return error; - - if (++*blk == jd->jd_blocks) - *blk = 0; - - if (*blk == orig_blk) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - } -} - -/** - * jhead_scan - make sure we've found the head of the log - * @jd: the journal - * @head: this is filled in with the log descriptor of the head - * - * At this point, seg and lh should be either the head of the log or just - * before. Scan forward until we find the head. - * - * Returns: errno - */ - -static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - unsigned int blk = head->lh_blkno; - struct gfs2_log_header_host lh; - int error; - - for (;;) { - if (++blk == jd->jd_blocks) - blk = 0; - - error = get_log_header(jd, blk, &lh); - if (error < 0) - return error; - if (error == 1) - continue; - - if (lh.lh_sequence == head->lh_sequence) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - if (lh.lh_sequence < head->lh_sequence) - break; - - *head = lh; - } - - return 0; -} - -/** - * gfs2_find_jhead - find the head of a log - * @jd: the journal - * @head: the log descriptor for the head of the log is returned here - * - * Do a binary search of a journal and find the valid log entry with the - * highest sequence number. (i.e. the log head) - * - * Returns: errno - */ - -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - struct gfs2_log_header_host lh_1, lh_m; - u32 blk_1, blk_2, blk_m; - int error; - - blk_1 = 0; - blk_2 = jd->jd_blocks - 1; - - for (;;) { - blk_m = (blk_1 + blk_2) / 2; - - error = find_good_lh(jd, &blk_1, &lh_1); - if (error) - return error; - - error = find_good_lh(jd, &blk_m, &lh_m); - if (error) - return error; - - if (blk_1 == blk_m || blk_m == blk_2) - break; - - if (lh_1.lh_sequence <= lh_m.lh_sequence) - blk_1 = blk_m; - else - blk_2 = blk_m; - } - - error = jhead_scan(jd, &lh_1); - if (error) - return error; - - *head = lh_1; - - return error; -} - /** * foreach_descriptor - go through the active part of the log * @jd: the journal @@ -469,7 +346,7 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - error = gfs2_find_jhead(jd, &head); + error = gfs2_find_jhead(jd, &head, true); if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 5932d4b6f43e..1831a1974c8c 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -27,8 +27,6 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); extern int __get_log_header(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b148cf03293..fbf6b1fd330b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -45,6 +45,7 @@ #include "util.h" #include "sys.h" #include "xattr.h" +#include "lops.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) @@ -425,7 +426,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - error = gfs2_find_jhead(sdp->sd_jdesc, &head); + error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); if (error) goto fail; @@ -680,7 +681,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, error = gfs2_jdesc_check(jd); if (error) break; - error = gfs2_find_jhead(jd, &lh); + error = gfs2_find_jhead(jd, &lh, false); if (error) break; if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { -- cgit v1.2.3