diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2010-09-14 20:26:27 +0200 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2010-10-14 18:38:50 +0200 |
commit | e9e6f3ec535d7b7c9e2ca64ad691e743e7d3c2f0 (patch) | |
tree | cbc17d81b9d937b4fc515548f30f5ed00be193ee /drivers/block/drbd/drbd_int.h | |
parent | 22cc37a943832c948808884604ec6f5ff2594c1d (diff) | |
download | lwn-e9e6f3ec535d7b7c9e2ca64ad691e743e7d3c2f0.tar.gz lwn-e9e6f3ec535d7b7c9e2ca64ad691e743e7d3c2f0.zip |
drbd: fix for possible deadlock on IO error during resync
Scenario:
Something (say, flush-147:0) is in drbd_al_begin_io,
holding a local_cnt, waiting for the resync to make progress.
Disk fails, worker in after_state_ch does drbd_rs_cancel_all,
then waits for local_cnt to drop to zero.
flush-147:0 is woken by drbd_rs_cancel_all, needs to write an AL
transaction, and queues that on the worker.
Deadlock.
Fix: do not wait in the worker, have put_ldev() trigger the
state change D_FAILED -> D_DISKLESS when necessary.
put_ldev() cannot do the state change directly, as it may or may not
already hold various spinlocks. We queue a short work instead.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_int.h')
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 8 |
1 files changed, 7 insertions, 1 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8ab6fed39539..c07c370c4c82 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -852,6 +852,7 @@ enum { BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ + GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -976,6 +977,7 @@ struct drbd_conf { unsigned int ko_count; struct drbd_work resync_work, unplug_work, + go_diskless, md_sync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; @@ -1278,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); +extern void drbd_go_diskless(struct drbd_conf *mdev); /* Meta data layout @@ -2123,8 +2126,11 @@ static inline void put_ldev(struct drbd_conf *mdev) int i = atomic_dec_return(&mdev->local_cnt); __release(local); D_ASSERT(i >= 0); - if (i == 0) + if (i == 0) { + if (mdev->state.disk == D_FAILED) + drbd_go_diskless(mdev); wake_up(&mdev->misc_wait); + } } #ifndef __CHECKER__ |