---
drivers/md/dm-regions.c | 634 ++++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/dm-regions.h | 75 +++++
2 files changed, 709 insertions(+)
Index: linux/drivers/md/dm-regions.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/md/dm-regions.c 2007-06-06 20:40:09.000000000 +0100
@@ -0,0 +1,634 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2007 Red Hat Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-log.h"
+#include "dm-regions.h"
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The set splits itself up into discrete regions.
+ * Each region can be in one of three states:
+ *
+ * o clean
+ * o dirty
+ * o nosync
+ *
+ * There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table, a region _may_
+ * be present on one of three lists.
+ *
+ * clean_regions: Regions on this list have no io pending to
+ * them, they are in sync, we are no longer interested in them,
+ * they are dull. rh_update_states() will remove them from the
+ * hash table.
+ *
+ * quiesced_regions: These regions have been spun down, ready
+ * for recovery. rh_recovery_start() will remove regions from
+ * this list and hand them to kmirrord, which will schedule the
+ * recovery io with kcopyd.
+ *
+ * recovered_regions: Regions that kcopyd has successfully
+ * recovered. rh_update_states() will now schedule any delayed
+ * io, up the recovery_count, and remove the region from the hash.
+ *
+ * There are 2 locks:
+ * A rw spin lock 'hash_lock' protects just the hash table,
+ * this is never held in write mode from interrupt context,
+ * which I believe means that we only have to disable irqs when
+ * doing a write lock.
+ *
+ * An ordinary spin lock 'region_lock' that protects the three
+ * lists in the region_hash, with the 'state', 'list' and
+ * 'delayed_bios' fields of the regions. This is used from irq
+ * context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
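+/*
+ * Typical caller flow (an illustrative sketch, not enforced here):
+ *
+ * o writes:	rh_inc_pending() on a batch of bios, issue the io,
+ *		then rh_dec() from the endio path for each bio.
+ * o recovery:	rh_recovery_prepare() marks regions recovering,
+ *		rh_recovery_start() hands back quiesced regions and
+ *		rh_recovery_end() queues them as recovered.
+ *
+ * A worker thread calls rh_update_states() to retire clean and
+ * recovered regions and dispatch any delayed bios.
+ */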
+enum region_hash_flags {
+ RECOVERY,
+};
+
+struct region_hash {
+ unsigned max_recovery; /* Max # of regions to recover in parallel */
+ unsigned long flags;
+
+ /* Callback function to dispatch queued writes on recovered regions. */
+ void (*dispatch)(void *context, struct bio_list *bios);
+ void *dispatch_context;
+
+	/* Callback function to wake up the caller's worker thread. */
+ void (*wake)(void *context);
+ void *wake_context;
+
+ uint32_t region_size;
+ unsigned region_shift;
+
+ /* holds persistent region state */
+ struct dirty_log *log;
+
+ /* hash table */
+ rwlock_t hash_lock;
+ mempool_t *region_pool;
+ unsigned mask;
+ unsigned nr_buckets;
+ struct list_head *buckets;
+
+ spinlock_t region_lock;
+ struct semaphore recovery_count;
+ struct list_head clean_regions;
+ struct list_head quiesced_regions;
+ struct list_head recovered_regions;
+};
+
+struct region {
+ struct region_hash *rh; /* FIXME: can we get rid of this ? */
+ region_t key;
+ int state;
+ void *context; /* Caller context. */
+
+ struct list_head hash_list;
+ struct list_head list;
+
+ atomic_t pending;
+ struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+region_t rh_sector_to_region(void *rh, sector_t sector)
+{
+	return sector >> ((struct region_hash *)rh)->region_shift;
+}
+EXPORT_SYMBOL(rh_sector_to_region);
+
+region_t rh_bio_to_region(void *rh, struct bio *bio)
+{
+ return rh_sector_to_region(rh, bio->bi_sector);
+}
+EXPORT_SYMBOL(rh_bio_to_region);
+
+sector_t rh_region_to_sector(void *rh, region_t region)
+{
+	return region << ((struct region_hash *)rh)->region_shift;
+}
+EXPORT_SYMBOL(rh_region_to_sector);
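+
+/*
+ * Example (illustrative): with region_size = 1024 sectors,
+ * region_shift = 10, so sector 5000 falls in region 4 and
+ * rh_region_to_sector(rh, 4) returns 4096, the first sector
+ * of that region.
+ */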
+
+/*
+ * Retrieval fns.
+ */
+region_t rh_get_region_key(void *reg)
+{
+ return ((struct region *)reg)->key;
+}
+EXPORT_SYMBOL(rh_get_region_key);
+
+sector_t rh_get_region_size(void *rh)
+{
+ return ((struct region_hash *)rh)->region_size;
+}
+EXPORT_SYMBOL(rh_get_region_size);
+
+/* Squirrel a context with a region. */
+void *rh_reg_get_context(void *reg)
+{
+	return ((struct region *)reg)->context;
+}
+EXPORT_SYMBOL(rh_reg_get_context);
+
+void rh_reg_set_context(void *reg, void *context)
+{
+	((struct region *)reg)->context = context;
+}
+EXPORT_SYMBOL(rh_reg_set_context);
+
+/*
+ * Region struct allocation/free.
+ */
+static void *region_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ return kmalloc(sizeof(struct region), gfp_mask);
+}
+
+static void region_free(void *element, void *pool_data)
+{
+ kfree(element);
+}
+
+#define MIN_REGIONS 64
+int rh_init(void **region_hash,
+ unsigned max_recovery,
+ void (*dispatch)(void *dispatch_context, struct bio_list *bios),
+ void *dispatch_context,
+ void (*wake)(void *wake_context),
+ void *wake_context,
+ struct dirty_log *log, uint32_t region_size, region_t nr_regions)
+{
+ unsigned nr_buckets, max_buckets;
+ size_t i;
+ struct region_hash *rh;
+
+ if (region_size & (region_size - 1)) {
+		DMERR("region size must be a power of 2");
+ return -EINVAL;
+ }
+
+ rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+ if (!rh) {
+ DMERR("unable to allocate region hash memory");
+ return -ENOMEM;
+ }
+
+ /*
+ * Calculate a suitable number of buckets for our hash
+ * table.
+ */
+ max_buckets = nr_regions >> 6;
+ for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1);
+ nr_buckets >>= 1;
+
+ rh->max_recovery = max_recovery;
+ rh->dispatch = dispatch;
+ rh->dispatch_context = dispatch_context;
+ rh->wake = wake;
+ rh->wake_context = wake_context;
+ rh->log = log;
+ rh->region_size = region_size;
+ rh->region_shift = ffs(region_size) - 1;
+ rwlock_init(&rh->hash_lock);
+ rh->mask = nr_buckets - 1;
+ rh->nr_buckets = nr_buckets;
+
+ rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+ if (!rh->buckets) {
+ DMERR("unable to allocate region hash bucket memory");
+		kfree(rh);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_buckets; i++)
+ INIT_LIST_HEAD(rh->buckets + i);
+
+ spin_lock_init(&rh->region_lock);
+ sema_init(&rh->recovery_count, 0);
+ INIT_LIST_HEAD(&rh->clean_regions);
+ INIT_LIST_HEAD(&rh->quiesced_regions);
+ INIT_LIST_HEAD(&rh->recovered_regions);
+
+ rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
+ region_free, NULL);
+ if (!rh->region_pool) {
+ vfree(rh->buckets);
+		kfree(rh);
+ return -ENOMEM;
+ }
+
+ *region_hash = rh;
+
+ return 0;
+}
+EXPORT_SYMBOL(rh_init);
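+
+/*
+ * Illustrative use from a target constructor (names hypothetical):
+ *
+ *	void *rh;
+ *	int r = rh_init(&rh, MAX_RECOVERY, dispatch_bios, ms,
+ *			wake_worker, ms, log, region_size, nr_regions);
+ *
+ * Note that rh_exit() also destroys the dirty log that was
+ * handed in here.
+ */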
+
+void rh_exit(void *v)
+{
+ unsigned h;
+ struct region *reg, *tmp;
+ struct region_hash *rh = v;
+
+ BUG_ON(!list_empty(&rh->quiesced_regions));
+
+ for (h = 0; h < rh->nr_buckets; h++) {
+ list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
+			BUG_ON(atomic_read(&reg->pending));
+ mempool_free(reg, rh->region_pool);
+ }
+ }
+
+ dm_destroy_dirty_log(rh->log);
+
+ if (rh->region_pool)
+ mempool_destroy(rh->region_pool);
+
+ vfree(rh->buckets);
+ kfree(rh);
+}
+EXPORT_SYMBOL(rh_exit);
+
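+/*
+ * Multiplicative hashing: multiply the region number by a large
+ * constant and keep the middle bits, which spreads consecutive
+ * region numbers across the buckets cheaply.
+ */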
+#define RH_HASH_MULT 2654435387U
+
+static inline unsigned rh_hash(struct region_hash *rh, region_t region)
+{
+ return (unsigned) ((region * RH_HASH_MULT) >> 12) & rh->mask;
+}
+
+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
+{
+ struct region *reg;
+
+ list_for_each_entry(reg, rh->buckets + rh_hash(rh, region), hash_list) {
+ if (reg->key == region)
+ return reg;
+ }
+
+ return NULL;
+}
+
+static void __rh_insert(struct region_hash *rh, struct region *reg)
+{
+ unsigned h = rh_hash(rh, reg->key);
+	list_add(&reg->hash_list, rh->buckets + h);
+}
+
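+/*
+ * Allocate and insert a new region.  Called with hash_lock held for
+ * read; that lock is dropped around the mempool allocation and then
+ * retaken for write, so another thread may have inserted the same
+ * region meanwhile - hence the second __rh_lookup() below.
+ */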
+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
+{
+ struct region *reg, *nreg;
+
+ read_unlock(&rh->hash_lock);
+ nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+ nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+ RH_CLEAN : RH_NOSYNC;
+ nreg->rh = rh;
+ nreg->key = region;
+
+ INIT_LIST_HEAD(&nreg->list);
+
+ atomic_set(&nreg->pending, 0);
+ bio_list_init(&nreg->delayed_bios);
+
+ write_lock_irq(&rh->hash_lock);
+
+	reg = __rh_lookup(rh, region);
+	if (reg) {
+		/* We lost the race. */
+		mempool_free(nreg, rh->region_pool);
+	} else {
+ __rh_insert(rh, nreg);
+ if (nreg->state == RH_CLEAN) {
+ spin_lock(&rh->region_lock);
+ list_add(&nreg->list, &rh->clean_regions);
+ spin_unlock(&rh->region_lock);
+ }
+ reg = nreg;
+ }
+
+ write_unlock_irq(&rh->hash_lock);
+ read_lock(&rh->hash_lock);
+
+ return reg;
+}
+
+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
+{
+ struct region *reg;
+
+ reg = __rh_lookup(rh, region);
+ if (!reg)
+ reg = __rh_alloc(rh, region);
+
+ return reg;
+}
+
+int rh_state(void *v, region_t region, int may_block)
+{
+ int r = 0;
+ struct region *reg;
+ struct region_hash *rh = v;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_lookup(rh, region);
+ if (reg)
+ r = reg->state;
+ read_unlock(&rh->hash_lock);
+
+ if (r)
+ return r;
+
+ /*
+ * The region wasn't in the hash, so we fall back to the dirty log.
+ */
+ r = rh->log->type->in_sync(rh->log, region, may_block);
+
+ /*
+ * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+ * taken as a RH_NOSYNC
+ */
+ return r == 1 ? RH_CLEAN : RH_NOSYNC;
+}
+EXPORT_SYMBOL(rh_state);
+
+void rh_update_states(void *v)
+{
+ struct region *reg, *next;
+ struct region_hash *rh = v;
+ LIST_HEAD(clean);
+ LIST_HEAD(recovered);
+
+ /*
+ * Quickly grab the lists.
+ */
+ write_lock_irq(&rh->hash_lock);
+ spin_lock(&rh->region_lock);
+ if (!list_empty(&rh->clean_regions)) {
+ list_splice(&rh->clean_regions, &clean);
+ INIT_LIST_HEAD(&rh->clean_regions);
+
+ list_for_each_entry(reg, &clean, list) {
+ rh->log->type->clear_region(rh->log, reg->key);
+			list_del(&reg->hash_list);
+ }
+ }
+
+ if (!list_empty(&rh->recovered_regions)) {
+ list_splice(&rh->recovered_regions, &recovered);
+ INIT_LIST_HEAD(&rh->recovered_regions);
+
+ list_for_each_entry(reg, &recovered, list)
+			list_del(&reg->hash_list);
+ }
+
+ spin_unlock(&rh->region_lock);
+ write_unlock_irq(&rh->hash_lock);
+
+ /*
+ * All the regions on the recovered and clean lists have
+ * now been pulled out of the system, so no need to do
+ * any more locking.
+ */
+	list_for_each_entry_safe(reg, next, &recovered, list) {
+ if (reg->state != RH_ERROR)
+ rh->log->type->clear_region(rh->log, reg->key);
+
+ rh->log->type->set_region_sync(rh->log, reg->key,
+ reg->state != RH_ERROR);
+ up(&rh->recovery_count);
+ if (reg->delayed_bios.head)
+			rh->dispatch(rh->dispatch_context, &reg->delayed_bios);
+
+ mempool_free(reg, rh->region_pool);
+ }
+
+ if (!list_empty(&recovered))
+ rh_flush(rh);
+
+ list_for_each_entry_safe(reg, next, &clean, list)
+ mempool_free(reg, rh->region_pool);
+}
+EXPORT_SYMBOL(rh_update_states);
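+
+/*
+ * Illustrative: a caller's worker thread would typically run
+ * rh_update_states() once per wakeup, before handling new io,
+ * so delayed bios on just-recovered regions are dispatched
+ * promptly.
+ */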
+
+void rh_inc(void *v, region_t region)
+{
+ struct region *reg;
+ struct region_hash *rh = v;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+ if (reg->state == RH_CLEAN) {
+ rh->log->type->mark_region(rh->log, reg->key);
+
+ spin_lock_irq(&rh->region_lock);
+ reg->state = RH_DIRTY;
+		list_del_init(&reg->list);	/* Take off the clean list. */
+ spin_unlock_irq(&rh->region_lock);
+ }
+
+	atomic_inc(&reg->pending);
+ read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL(rh_inc);
+
+void rh_inc_pending(void *v, struct bio_list *bios)
+{
+ struct bio *bio;
+ struct region_hash *rh = v;
+
+ for (bio = bios->head; bio; bio = bio->bi_next)
+ rh_inc(rh, rh_bio_to_region(rh, bio));
+}
+EXPORT_SYMBOL(rh_inc_pending);
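+
+/*
+ * Every rh_inc()/rh_inc_pending() must be balanced by an rh_dec()
+ * on the same region once the io completes.  rh_dec() takes
+ * region_lock with irqs saved, so calling it from the endio
+ * (interrupt) path is safe.
+ */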
+
+void rh_dec(void *v, region_t region)
+{
+ unsigned long flags;
+ struct region *reg;
+ struct region_hash *rh = v;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_lookup(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ BUG_ON(!reg);
+
+	if (atomic_dec_and_test(&reg->pending)) {
+		spin_lock_irqsave(&rh->region_lock, flags);
+		if (reg->state == RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else {
+			reg->state = RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+ }
+ spin_unlock_irqrestore(&rh->region_lock, flags);
+ }
+}
+EXPORT_SYMBOL(rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct region_hash *rh)
+{
+ int r;
+ struct region *reg;
+ region_t region;
+
+ /*
+ * Ask the dirty log what's next.
+ */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+ if (r <= 0)
+ return r;
+
+ /*
+ * Get this region, and start it quiescing
+ * by setting the recovering flag.
+ */
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+ read_unlock(&rh->hash_lock);
+
+ spin_lock_irq(&rh->region_lock);
+
+ reg->state = RH_RECOVERING;
+
+ /* Already quiesced ? */
+	list_del_init(&reg->list);
+	if (!atomic_read(&reg->pending))
+		list_add(&reg->list, &rh->quiesced_regions);
+
+ spin_unlock_irq(&rh->region_lock);
+
+ return 1;
+}
+
+int rh_recovery_prepare(void *v)
+{
+ struct region_hash *rh = v;
+
+ if (test_bit(RECOVERY, &rh->flags)) {
+ while (!down_trylock(&rh->recovery_count)) {
+ if (__rh_recovery_prepare(rh) <= 0) {
+ up(&rh->recovery_count);
+ return -ENOENT;
+ }
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(rh_recovery_prepare);
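+
+/*
+ * Illustrative recovery loop for a caller's worker thread:
+ *
+ *	rh_recovery_prepare(rh);
+ *	while ((reg = rh_recovery_start(rh)))
+ *		hand reg to kcopyd, calling rh_recovery_end(reg, error)
+ *		from the copy completion callback;
+ *
+ * followed later by rh_update_states() to retire the recovered
+ * regions.
+ */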
+
+/*
+ * Returns any quiesced regions.
+ */
+void *rh_recovery_start(void *v)
+{
+ struct region *reg = NULL;
+ struct region_hash *rh = v;
+
+ spin_lock_irq(&rh->region_lock);
+ if (!list_empty(&rh->quiesced_regions)) {
+ reg = list_entry(rh->quiesced_regions.next,
+ struct region, list);
+		list_del_init(&reg->list);	/* Remove from the quiesced list. */
+ }
+ spin_unlock_irq(&rh->region_lock);
+
+	return (void *) reg;
+}
+EXPORT_SYMBOL(rh_recovery_start);
+
+/*
+ * Put region on list of recovered ones.
+ */
+void rh_recovery_end(void *v, int error)
+{
+ struct region *reg = v;
+ struct region_hash *rh = reg->rh;
+
+ if (error)
+ reg->state = RH_ERROR;
+
+ spin_lock_irq(&rh->region_lock);
+	list_add(&reg->list, &rh->recovered_regions);
+ spin_unlock_irq(&rh->region_lock);
+}
+EXPORT_SYMBOL(rh_recovery_end);
+
+void rh_flush(void *v)
+{
+ struct region_hash *rh = v;
+
+ rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL(rh_flush);
+
+void rh_delay_by_region(void *v, struct bio *bio, region_t region)
+{
+ struct region_hash *rh = v;
+ struct region *reg;
+
+ read_lock(&rh->hash_lock);
+ reg = __rh_find(rh, region);
+	bio_list_add(&reg->delayed_bios, bio);
+ read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL(rh_delay_by_region);
+
+void rh_delay(void *v, struct bio *bio)
+{
+	rh_delay_by_region(v, bio, rh_bio_to_region(v, bio));
+}
+EXPORT_SYMBOL(rh_delay);
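+
+/*
+ * Delayed bios stay on their region until recovery of that region
+ * ends; rh_update_states() then hands them back through the
+ * dispatch() callback.
+ */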
+
+void rh_stop_recovery(void *v)
+{
+	unsigned i;
+ struct region_hash *rh = v;
+
+ clear_bit(RECOVERY, &rh->flags);
+ rh->wake(rh->wake_context);
+
+ /* wait for any recovering regions */
+ for (i = 0; i < rh->max_recovery; i++)
+ down(&rh->recovery_count);
+}
+EXPORT_SYMBOL(rh_stop_recovery);
+
+void rh_start_recovery(void *v)
+{
+	unsigned i;
+ struct region_hash *rh = v;
+
+ set_bit(RECOVERY, &rh->flags);
+ for (i = 0; i < rh->max_recovery; i++)
+ up(&rh->recovery_count);
+
+ rh->wake(rh->wake_context);
+}
+EXPORT_SYMBOL(rh_start_recovery);
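+
+/*
+ * rh_stop_recovery() and rh_start_recovery() are meant to be
+ * paired, e.g. around a target suspend/resume cycle: stopping
+ * drains recovery_count to wait out in-flight recoveries,
+ * starting replenishes its max_recovery tokens.
+ */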
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Heinz Mauelshagen");
+MODULE_LICENSE("GPL");
Index: linux/drivers/md/dm-regions.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/md/dm-regions.h 2007-06-06 20:40:09.000000000 +0100
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2007 Red Hat Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_REGIONS_H
+#define DM_REGIONS_H
+
+#include "dm.h"
+#include "dm-log.h"
+#include "dm-bio-list.h"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *----------------------------------------------------------------*/
+
+/*
+ * States a region can have.
+ */
+enum {
+ RH_CLEAN = 0x01, /* No writes in flight. */
+ RH_DIRTY = 0x02, /* Writes in flight. */
+ RH_NOSYNC = 0x04, /* Out of sync. */
+ RH_RECOVERING = 0x08, /* Under resynchronization. */
+ RH_ERROR = 0x10, /* Error recovering region */
+};
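+
+/*
+ * rh_state() returns one of the above, e.g.:
+ *
+ *	if (rh_state(rh, region, 1) == RH_NOSYNC)
+ *		the region needs recovery before it is in sync.
+ */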
+
+/*
+ * Conversion fns
+ */
+region_t rh_bio_to_region(void *rh, struct bio *bio);
+region_t rh_sector_to_region(void *rh, sector_t sector);
+sector_t rh_region_to_sector(void *rh, region_t region);
+
+/*
+ * Functions to set a caller context in a region.
+ */
+void *rh_reg_get_context(void *reg);
+void rh_reg_set_context(void *reg, void *context);
+
+/*
+ * Region hash and region parameters.
+ */
+sector_t rh_get_region_size(void *rh);
+region_t rh_get_region_key(void *reg);
+
+int rh_init(void **rh,
+ unsigned max_recovery,
+ void (*dispatch)(void *dispatch_context, struct bio_list *bios),
+ void *dispatch_context,
+ void (*wake)(void *wake_context),
+ void *wake_context,
+ struct dirty_log *log, uint32_t region_size, region_t nr_regions);
+void rh_exit(void *rh);
+
+int rh_state(void *rh, region_t region, int may_block);
+void rh_update_states(void *rh);
+void rh_flush(void *rh);
+
+void rh_inc(void *rh, region_t region);
+void rh_inc_pending(void *rh, struct bio_list *bios);
+void rh_dec(void *rh, region_t region);
+void rh_delay(void *rh, struct bio *bio);
+void rh_delay_by_region(void *rh, struct bio *bio, region_t region);
+
+int rh_recovery_prepare(void *rh);
+void *rh_recovery_start(void *rh);
+void rh_recovery_end(void *reg, int error);
+void rh_stop_recovery(void *rh);
+void rh_start_recovery(void *rh);
+
+#endif