From d76a3a77113db020d9bb1e894822869410450bd9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:02:52 -0400 Subject: ext4/jbd2: don't wait (forever) for stale tid caused by wraparound In the case where an inode has a very stale transaction id (tid) in i_datasync_tid or i_sync_tid, it's possible that after a very large (2**31) number of transactions, that the tid number space might wrap, causing tid_geq()'s calculations to fail. Commit deeeaf13 "jbd2: fix fsync() tid wraparound bug", later modified by commit e7b04ac0 "jbd2: don't wake kjournald unnecessarily", attempted to fix this problem, but it only avoided kjournald spinning forever by fixing the logic in jbd2_log_start_commit(). Unfortunately, in the codepaths in fs/ext4/fsync.c and fs/ext4/inode.c that might call jbd2_log_start_commit() with a stale tid, those functions will subsequently call jbd2_log_wait_commit() with the same stale tid, and then wait for a very long time. To fix this, we replace the calls to jbd2_log_start_commit() and jbd2_log_wait_commit() with a call to a new function, jbd2_complete_transaction(), which will correctly handle stale tid's. As a bonus, jbd2_complete_transaction() will avoid locking j_state_lock for writing unless a commit needs to be started. This should have a small (but probably not measurable) improvement for ext4's scalability. Signed-off-by: "Theodore Ts'o" Reported-by: Ben Hutchings Reported-by: George Barnett Cc: stable@vger.kernel.org --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50e5a5e6a712..f0289754b464 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1200,6 +1200,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit v1.2.3 From 794446c6946513c684d448205fbd76fa35f38b72 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:06:52 -0400 Subject: jbd2: fix race between jbd2_journal_remove_checkpoint and ->j_commit_callback The following race is possible: [kjournald2] other_task jbd2_journal_commit_transaction() j_state = T_FINISHED; spin_unlock(&journal->j_list_lock); ->jbd2_journal_remove_checkpoint() ->jbd2_journal_free_transaction(); ->kmem_cache_free(transaction) ->j_commit_callback(journal, transaction); -> USE_AFTER_FREE WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250() Hardware name: list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod Pid: 16400, comm: jbd2/dm-1-8 Tainted: G W 3.8.0-rc3+ #107 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_fmt+0x46/0x50 [] ? ext4_journal_commit_callback+0x99/0xc0 [] __list_del_entry+0x1c0/0x250 [] ext4_journal_commit_callback+0x6f/0xc0 [] jbd2_journal_commit_transaction+0x23a6/0x2570 [] ? try_to_del_timer_sync+0x82/0xa0 [] ? del_timer_sync+0x91/0x1e0 [] kjournald2+0x19f/0x6a0 [] ? wake_up_bit+0x40/0x40 [] ? bit_spin_lock+0x80/0x80 [] kthread+0x10e/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x70/0x70 In order to demonstrace this issue one should mount ext4 with mount -o discard option on SSD disk. This makes callback longer and race window becomes wider. In order to fix this we should mark transaction as finished only after callbacks have completed Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/commit.c | 50 ++++++++++++++++++++++++++++---------------------- include/linux/jbd2.h | 1 + 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1134,7 +1134,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1149,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may be block. + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f0289754b464..f9fe88957b7a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -480,6 +480,7 @@ struct transaction_s T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, + T_COMMIT_CALLBACK, T_FINISHED } t_state; -- cgit v1.2.3 From ae4647fb7654676fc44a97e86eb35f9f06b99f66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Apr 2013 00:03:42 -0400 Subject: jbd2: reduce journal_head size Remove unused t_cow_tid field (ext4 copy-on-write support doesn't seem to be happening) and change b_modified and b_jlist to bitfields thus saving 8 bytes in the structure. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- include/linux/journal-head.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index c18b46f8aeeb..13a3da25ff07 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -31,21 +31,14 @@ struct journal_head { /* * Journalling list for this buffer [jbd_lock_bh_state()] */ - unsigned b_jlist; + unsigned b_jlist:4; /* * This flag signals the buffer has been modified by * the currently running transaction * [jbd_lock_bh_state()] */ - unsigned b_modified; - - /* - * This feild tracks the last transaction id in which this buffer - * has been cowed - * [jbd_lock_bh_state()] - */ - tid_t b_cow_tid; + unsigned b_modified:1; /* * Copy of the buffer data frozen for writing to the log. -- cgit v1.2.3 From 28daf4fae8693d4a285123494899fe01950cba50 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 19 Apr 2013 17:49:23 -0400 Subject: jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset The jbd2_alloc_handle() function is only called by new_handle(). So this commit uses kmem_cache_zalloc() instead of kmem_cache_alloc()/memset(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 1 - include/linux/jbd2.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 325bc019ed88..a1920da22802 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f9fe88957b7a..6e051f472edb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1145,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache; static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) { - return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); + return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); } static inline void jbd2_free_handle(handle_t *handle) -- cgit v1.2.3 From 877f962c5edacfef60ab21cfed6d8d54ce25b8a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Apr 2013 19:58:37 -0400 Subject: buffer: add BH_Prio and BH_Meta flags Add buffer_head flags so that buffer cache writebacks can be marked with the the appropriate request flags, so that metadata blocks can be marked appropriately in blktrace. Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 5 +++++ include/linux/buffer_head.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..a15575c0b9ee 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2988,6 +2988,11 @@ int submit_bh(int rw, struct buffer_head * bh) /* Take care of bh's that straddle the end of the device */ guard_bh_eod(rw, bio, bh); + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + bio_get(bio); submit_bio(rw, bio); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5afc4f94d110..33c0f8103fe4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -34,6 +34,8 @@ enum bh_state_bits { BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ + BH_Meta, /* Buffer contains metadata */ + BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) +BUFFER_FNS(Meta, meta) +BUFFER_FNS(Prio, prio) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) -- cgit v1.2.3