https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=169302
If you lose all the paths to a dm multipath device, you can set
queue_if_no_path so that I/O gets queued. If one of the paths comes
back, it will be re-enabled and the queued I/O released.
The current implementation does not allow the paths to be reconfigured (e.g.
by adding new paths) while I/O is queued like this: the queued I/O is
discarded (with -EIO) before the reconfiguration takes effect. Some consider
this a bug.
This patch allows the multipath tools to set DM_NOFLUSH_FLAG during a
suspend operation to request that I/O be queued across such a path
reconfiguration. During that suspend, targets such as multipath transfer
('push back') their queue of waiting I/O onto the existing queue held in
the device-mapper core.
[NEC]
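
For illustration only (not part of the patch): a minimal sketch of how a
userspace caller compiled against the updated linux/dm-ioctl.h might request
a noflush suspend through the ioctl interface this patch extends. The device
name "mpath0" and the bare-bones error handling are assumptions; in practice
the multipath tools drive this through libdevmapper, and dmsetup exposes the
same request as 'dmsetup suspend --noflush <dev>'.

	/* Sketch: request a noflush suspend of map "mpath0" (assumed name). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/dm-ioctl.h>

	int main(void)
	{
		struct dm_ioctl dmi;
		int fd = open("/dev/mapper/control", O_RDWR);

		if (fd < 0) {
			perror("open /dev/mapper/control");
			return 1;
		}

		memset(&dmi, 0, sizeof(dmi));
		dmi.version[0] = DM_VERSION_MAJOR;
		dmi.version[1] = DM_VERSION_MINOR;	/* needs >= 4.11 for noflush */
		dmi.version[2] = DM_VERSION_PATCHLEVEL;
		dmi.data_size = sizeof(dmi);		/* suspend carries no payload */
		strncpy(dmi.name, "mpath0", sizeof(dmi.name) - 1);

		/*
		 * DM_SUSPEND_FLAG selects suspend (rather than resume);
		 * DM_NOFLUSH_FLAG asks the core to queue I/O instead of
		 * erroring it while the device is suspended.
		 */
		dmi.flags = DM_SUSPEND_FLAG | DM_NOFLUSH_FLAG;

		if (ioctl(fd, DM_DEV_SUSPEND, &dmi) < 0) {
			perror("DM_DEV_SUSPEND");
			close(fd);
			return 1;
		}

		close(fd);
		return 0;
	}
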
Index: linux-2.6.18-rc7/drivers/md/dm.c
===================================================================
--- linux-2.6.18-rc7.orig/drivers/md/dm.c 2006-09-18 23:28:03.000000000 +0100
+++ linux-2.6.18-rc7/drivers/md/dm.c 2006-10-06 18:47:22.000000000 +0100
@@ -68,10 +68,12 @@ union map_info *dm_get_mapinfo(struct bi
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
+#define DMF_NOFLUSH_SUSPENDING 5
struct mapped_device {
struct rw_semaphore io_lock;
struct semaphore suspend_lock;
+ spinlock_t pushback_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
@@ -90,6 +92,7 @@ struct mapped_device {
atomic_t pending;
wait_queue_head_t wait;
struct bio_list deferred;
+ struct bio_list pushback;
/*
* The current mapping.
@@ -444,23 +447,50 @@ int dm_set_geometry(struct mapped_device
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
+static int __noflush_suspending(struct mapped_device *md)
+{
+ return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+}
+
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
static void dec_pending(struct dm_io *io, int error)
{
- if (error)
+ unsigned long flags;
+
+ /* Push-back supersedes any I/O errors */
+ if (error && !(io->error > 0 && __noflush_suspending(io->md)))
io->error = error;
if (atomic_dec_and_test(&io->io_count)) {
+ if (io->error == DM_ENDIO_REQUEUE) {
+ /*
+ * Target requested pushing back the I/O.
+ * This must be handled before the sleeper on
+ * suspend queue merges the pushback list.
+ */
+ spin_lock_irqsave(&io->md->pushback_lock, flags);
+ if (__noflush_suspending(io->md))
+ bio_list_add(&io->md->pushback, io->bio);
+ else
+ /* noflush suspend was interrupted. */
+ io->error = -EIO;
+ spin_unlock_irqrestore(&io->md->pushback_lock, flags);
+ }
+
if (end_io_acct(io))
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);
- blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+ if (io->error != DM_ENDIO_REQUEUE) {
+ blk_add_trace_bio(io->md->queue, io->bio,
+ BLK_TA_COMPLETE);
+
+ bio_endio(io->bio, io->bio->bi_size, io->error);
+ }
- bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
}
@@ -480,12 +510,19 @@ static int clone_endio(struct bio *bio,
if (endio) {
r = endio(tio->ti, bio, error, &tio->info);
- if (r < 0)
+ if (r < 0 || r == DM_ENDIO_REQUEUE)
+ /*
+ * error and requeue request are handled
+ * in dec_pending().
+ */
error = r;
-
- else if (r > 0)
- /* the target wants another shot at the io */
+ else if (r == DM_ENDIO_INCOMPLETE)
+ /* The target will handle the io */
return 1;
+ else if (r) {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
}
dec_pending(tio->io, error);
@@ -543,7 +580,7 @@ static void __map_bio(struct dm_target *
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
- if (r > 0) {
+ if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
@@ -551,10 +588,8 @@ static void __map_bio(struct dm_target *
clone->bi_sector);
generic_make_request(clone);
- }
-
- else if (r < 0) {
- /* error the io and bail out */
+ } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
+ /* error the io and bail out, or requeue it if needed */
md = tio->io->md;
dec_pending(tio->io, r);
/*
@@ -563,6 +598,9 @@ static void __map_bio(struct dm_target *
clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
+ } else if (r) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
}
}
@@ -948,6 +986,7 @@ static struct mapped_device *alloc_dev(i
memset(md, 0, sizeof(*md));
init_rwsem(&md->io_lock);
init_MUTEX(&md->suspend_lock);
+ spin_lock_init(&md->pushback_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
@@ -1275,12 +1314,15 @@ static void unlock_fs(struct mapped_devi
* dm_bind_table, dm_suspend must be called to flush any in
* flight bios and ensure that any further io gets deferred.
*/
-int dm_suspend(struct mapped_device *md, int do_lockfs)
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
+ unsigned long flags;
DECLARE_WAITQUEUE(wait, current);
struct bio *def;
int r = -EINVAL;
+ int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
+ int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
down(&md->suspend_lock);
@@ -1289,6 +1331,13 @@ int dm_suspend(struct mapped_device *md,
map = dm_get_table(md);
+ /*
+ * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
+ * This flag is cleared before dm_suspend returns.
+ */
+ if (noflush)
+ set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+
/* This does not get reverted if there's an error later. */
dm_table_presuspend_targets(map);
@@ -1296,11 +1345,14 @@ int dm_suspend(struct mapped_device *md,
if (!md->suspended_bdev) {
DMWARN("bdget failed in dm_suspend");
r = -ENOMEM;
- goto out;
+ goto flush_and_out;
}
- /* Flush I/O to the device. */
- if (do_lockfs) {
+ /*
+ * Flush I/O to the device.
+ * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
+ */
+ if (do_lockfs && !noflush) {
r = lock_fs(md);
if (r)
goto out;
@@ -1336,6 +1388,14 @@ int dm_suspend(struct mapped_device *md,
down_write(&md->io_lock);
remove_wait_queue(&md->wait, &wait);
+ if (noflush) {
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+ }
+
/* were we interrupted ? */
r = -EINTR;
if (atomic_read(&md->pending)) {
@@ -1344,7 +1404,7 @@ int dm_suspend(struct mapped_device *md,
__flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md);
- goto out;
+ goto out; /* pushback list is already flushed, so skip flush */
}
up_write(&md->io_lock);
@@ -1354,6 +1414,25 @@ int dm_suspend(struct mapped_device *md,
r = 0;
+flush_and_out:
+ if (r && noflush) {
+ /*
+ * Because there may be already I/Os in the pushback list,
+ * flush them before return.
+ */
+ down_write(&md->io_lock);
+
+ spin_lock_irqsave(&md->pushback_lock, flags);
+ clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+ bio_list_merge_head(&md->deferred, &md->pushback);
+ bio_list_init(&md->pushback);
+ spin_unlock_irqrestore(&md->pushback_lock, flags);
+
+ def = bio_list_get(&md->deferred);
+ __flush_deferred_io(md, def);
+ up_write(&md->io_lock);
+ }
+
out:
if (r && md->suspended_bdev) {
bdput(md->suspended_bdev);
@@ -1438,6 +1517,17 @@ int dm_suspended(struct mapped_device *m
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_noflush_suspending(struct dm_target *ti)
+{
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ int r = __noflush_suspending(md);
+
+ dm_put(md);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
Index: linux-2.6.18-rc7/drivers/md/dm-ioctl.c
===================================================================
--- linux-2.6.18-rc7.orig/drivers/md/dm-ioctl.c 2006-10-06 15:10:44.000000000 +0100
+++ linux-2.6.18-rc7/drivers/md/dm-ioctl.c 2006-10-06 18:47:22.000000000 +0100
@@ -765,7 +765,7 @@ out:
static int do_suspend(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct mapped_device *md;
md = find_device(param);
@@ -773,10 +773,12 @@ static int do_suspend(struct dm_ioctl *p
return -ENXIO;
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- r = dm_suspend(md, do_lockfs);
+ r = dm_suspend(md, suspend_flags);
if (!r)
r = __dev_status(md, param);
@@ -788,7 +790,7 @@ static int do_suspend(struct dm_ioctl *p
static int do_resume(struct dm_ioctl *param)
{
int r = 0;
- int do_lockfs = 1;
+ unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct hash_cell *hc;
struct mapped_device *md;
struct dm_table *new_map;
@@ -814,9 +816,11 @@ static int do_resume(struct dm_ioctl *pa
if (new_map) {
/* Suspend if it isn't already suspended */
if (param->flags & DM_SKIP_LOCKFS_FLAG)
- do_lockfs = 0;
+ suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
+ if (param->flags & DM_NOFLUSH_FLAG)
+ suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
- dm_suspend(md, do_lockfs);
+ dm_suspend(md, suspend_flags);
r = dm_swap_table(md, new_map);
if (r) {
Index: linux-2.6.18-rc7/include/linux/device-mapper.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/device-mapper.h 2006-09-14 23:24:03.000000000 +0100
+++ linux-2.6.18-rc7/include/linux/device-mapper.h 2006-10-06 18:47:22.000000000 +0100
@@ -39,7 +39,8 @@ typedef void (*dm_dtr_fn) (struct dm_tar
* The map function must return:
* < 0: error
* = 0: The target will handle the io by resubmitting it later
- * > 0: simple remap complete
+ * = 1: simple remap complete
+ * = 2: The target wants to push back the io
*/
typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
union map_info *map_context);
@@ -50,6 +51,7 @@ typedef int (*dm_map_fn) (struct dm_targ
* 0 : ended successfully
* 1 : for some reason the io has still not completed (eg,
* multipath target might want to requeue a failed io).
+ * 2 : The target wants to push back the io
*/
typedef int (*dm_endio_fn) (struct dm_target *ti,
struct bio *bio, int error,
@@ -173,7 +175,7 @@ void *dm_get_mdptr(struct mapped_device
/*
* A device can still be used while suspended, but I/O is deferred.
*/
-int dm_suspend(struct mapped_device *md, int with_lockfs);
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags);
int dm_resume(struct mapped_device *md);
/*
@@ -188,6 +190,7 @@ int dm_wait_event(struct mapped_device *
const char *dm_device_name(struct mapped_device *md);
struct gendisk *dm_disk(struct mapped_device *md);
int dm_suspended(struct mapped_device *md);
+int dm_noflush_suspending(struct dm_target *ti);
/*
* Geometry functions.
Index: linux-2.6.18-rc7/drivers/md/dm.h
===================================================================
--- linux-2.6.18-rc7.orig/drivers/md/dm.h 2006-09-14 23:24:03.000000000 +0100
+++ linux-2.6.18-rc7/drivers/md/dm.h 2006-10-06 18:47:22.000000000 +0100
@@ -33,6 +33,25 @@
#define SECTOR_SHIFT 9
/*
+ * Definitions of return values from target end_io function.
+ */
+#define DM_ENDIO_INCOMPLETE 1
+#define DM_ENDIO_REQUEUE 2
+
+/*
+ * Definitions of return values from target map function.
+ */
+#define DM_MAPIO_SUBMITTED 0
+#define DM_MAPIO_REMAPPED 1
+#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
+
+/*
+ * Suspend feature flags
+ */
+#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
+#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
+
+/*
* List of devices that a metadevice uses and should open/close.
*/
struct dm_dev {
Index: linux-2.6.18-rc7/drivers/md/dm-bio-list.h
===================================================================
--- linux-2.6.18-rc7.orig/drivers/md/dm-bio-list.h 2006-10-06 14:42:08.000000000 +0100
+++ linux-2.6.18-rc7/drivers/md/dm-bio-list.h 2006-10-06 18:47:22.000000000 +0100
@@ -44,6 +44,20 @@ static inline void bio_list_merge(struct
bl->tail = bl2->tail;
}
+static inline void bio_list_merge_head(struct bio_list *bl,
+ struct bio_list *bl2)
+{
+ if (!bl2->head)
+ return;
+
+ if (bl->head)
+ bl2->tail->bi_next = bl->head;
+ else
+ bl->tail = bl2->tail;
+
+ bl->head = bl2->head;
+}
+
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
struct bio *bio = bl->head;
Index: linux-2.6.18-rc7/include/linux/dm-ioctl.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/dm-ioctl.h 2006-09-14 23:24:03.000000000 +0100
+++ linux-2.6.18-rc7/include/linux/dm-ioctl.h 2006-10-06 18:47:22.000000000 +0100
@@ -285,7 +285,7 @@ typedef char ioctl_struct[308];
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
-#define DM_VERSION_MINOR 10
+#define DM_VERSION_MINOR 11
#define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2006-09-14)"
@@ -323,4 +323,9 @@ typedef char ioctl_struct[308];
*/
#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */
+/*
+ * Set this to suspend without flushing queued ios.
+ */
+#define DM_NOFLUSH_FLAG (1 << 11) /* In */
+
#endif /* _LINUX_DM_IOCTL_H */
Index: linux-2.6.18-rc7/drivers/md/dm-mpath.c
===================================================================
--- linux-2.6.18-rc7.orig/drivers/md/dm-mpath.c 2006-10-06 14:42:08.000000000 +0100
+++ linux-2.6.18-rc7/drivers/md/dm-mpath.c 2006-10-06 18:47:22.000000000 +0100
@@ -282,10 +282,13 @@ failed:
m->current_pg = NULL;
}
+#define __PUSHBACK(m) (!(m)->queue_if_no_path && (m)->saved_queue_if_no_path && \
+ dm_noflush_suspending((m)->ti))
+
static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
unsigned was_queued)
{
- int r = 1;
+ int r = DM_MAPIO_REMAPPED;
unsigned long flags;
struct pgpath *pgpath;
@@ -310,10 +313,13 @@ static int map_io(struct multipath *m, s
!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
- r = 0;
- } else if (!pgpath)
+ r = DM_MAPIO_SUBMITTED;
+ } else if (!pgpath) {
r = -EIO; /* Failed */
- else
+
+ if (__PUSHBACK(m))
+ r = DM_MAPIO_REQUEUE;
+ } else
bio->bi_bdev = pgpath->path.dev->bdev;
mpio->pgpath = pgpath;
@@ -372,8 +378,16 @@ static void dispatch_queued_ios(struct m
r = map_io(m, bio, mpio, 1);
if (r < 0)
bio_endio(bio, bio->bi_size, r);
- else if (r == 1)
+ else if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
+ else if (r == DM_MAPIO_REQUEUE)
+ /*
+ * end_io handles the requeue request by
+ * returning the bio with error status.
+ * We don't return the r value to end_io,
+ * since it is probably not needed.
+ */
+ bio_endio(bio, bio->bi_size, -EIO);
bio = next;
}
@@ -781,7 +795,7 @@ static int multipath_map(struct dm_targe
map_context->ptr = mpio;
bio->bi_rw |= (1 << BIO_RW_FAILFAST);
r = map_io(m, bio, mpio, 0);
- if (r < 0)
+ if (r < 0 || r == DM_MAPIO_REQUEUE)
mempool_free(mpio, m->mpio_pool);
return r;
@@ -1005,7 +1019,10 @@ static int do_end_io(struct multipath *m
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
- if (!m->queue_if_no_path) {
+ if (__PUSHBACK(m)) {
+ spin_unlock_irqrestore(&m->lock, flags);
+ return DM_ENDIO_REQUEUE;
+ } else if (!m->queue_if_no_path) {
spin_unlock_irqrestore(&m->lock, flags);
return -EIO;
} else {
@@ -1040,7 +1057,7 @@ static int do_end_io(struct multipath *m
queue_work(kmultipathd, &m->process_queued_ios);
spin_unlock_irqrestore(&m->lock, flags);
- return 1; /* io not complete */
+ return DM_ENDIO_INCOMPLETE; /* io not complete */
}
static int multipath_end_io(struct dm_target *ti, struct bio *bio,
@@ -1058,7 +1075,7 @@ static int multipath_end_io(struct dm_ta
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path);
}
- if (r <= 0)
+ if (r != DM_ENDIO_INCOMPLETE)
mempool_free(mpio, m->mpio_pool);
return r;