2.5 backport
--- diff/Documentation/Configure.help 2002-11-29 09:36:43.000000000 +0000
+++ source/Documentation/Configure.help 2002-11-29 09:44:59.000000000 +0000
@@ -1822,6 +1822,20 @@
want), say M here and read <file:Documentation/modules.txt>. The
module will be called lvm-mod.o.
+Device-mapper support
+CONFIG_BLK_DEV_DM
+  Device-mapper is a low-level volume manager.  It works by allowing
+  people to specify mappings for ranges of logical sectors.  Various
+  mapping types are available; in addition, people may write their own
+  modules containing custom mappings if they wish.
+
+  Higher-level volume managers such as LVM2 use this driver.
+
+ If you want to compile this as a module, say M here and read
+ <file:Documentation/modules.txt>. The module will be called dm-mod.o.
+
+ If unsure, say N.
+
Multiple devices driver support (RAID and LVM)
CONFIG_MD
Support multiple physical spindles through a single logical device.
--- diff/MAINTAINERS 2002-11-29 09:36:43.000000000 +0000
+++ source/MAINTAINERS 2002-11-29 09:44:59.000000000 +0000
@@ -439,6 +439,13 @@
W: http://www.debian.org/~dz/i8k/
S: Maintained
+DEVICE MAPPER
+P: Joe Thornber
+M: dm@uk.sistina.com
+L: linux-LVM@sistina.com
+W: http://www.sistina.com/lvm
+S: Maintained
+
DEVICE NUMBER REGISTRY
P: H. Peter Anvin
M: hpa@zytor.com
--- diff/drivers/md/Config.in 2001-09-26 16:15:05.000000000 +0100
+++ source/drivers/md/Config.in 2002-11-29 09:44:59.000000000 +0000
@@ -14,5 +14,8 @@
dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
+fi
endmenu
--- diff/drivers/md/Makefile 2002-01-17 10:07:52.000000000 +0000
+++ source/drivers/md/Makefile 2002-11-29 09:44:59.000000000 +0000
@@ -4,9 +4,12 @@
O_TARGET := mddev.o
-export-objs := md.o xor.o
+export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o
list-multi := lvm-mod.o
lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o
+dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \
+ dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
+ kcopyd.o
# Note: link order is important. All raid personalities
# and xor.o must come before md.o, as they each initialise
@@ -20,8 +23,12 @@
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_BLK_DEV_MD) += md.o
obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
+obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
include $(TOPDIR)/Rules.make
lvm-mod.o: $(lvm-mod-objs)
$(LD) -r -o $@ $(lvm-mod-objs)
+
+dm-mod.o: $(dm-mod-objs)
+ $(LD) -r -o $@ $(dm-mod-objs)
--- diff/fs/buffer.c 2002-11-29 09:36:46.000000000 +0000
+++ source/fs/buffer.c 2002-11-29 09:44:59.000000000 +0000
@@ -586,9 +586,10 @@
void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
{
spin_lock(&lru_list_lock);
- if (bh->b_inode)
+ if (buffer_inode(bh))
list_del(&bh->b_inode_buffers);
- bh->b_inode = inode;
+ else
+ set_buffer_inode(bh);
list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
spin_unlock(&lru_list_lock);
}
@@ -596,9 +597,10 @@
void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
{
spin_lock(&lru_list_lock);
- if (bh->b_inode)
+ if (buffer_inode(bh))
list_del(&bh->b_inode_buffers);
- bh->b_inode = inode;
+ else
+ set_buffer_inode(bh);
list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
spin_unlock(&lru_list_lock);
}
@@ -607,13 +609,13 @@
remove_inode_queue functions. */
static void __remove_inode_queue(struct buffer_head *bh)
{
- bh->b_inode = NULL;
+ clear_buffer_inode(bh);
list_del(&bh->b_inode_buffers);
}
static inline void remove_inode_queue(struct buffer_head *bh)
{
- if (bh->b_inode)
+ if (buffer_inode(bh))
__remove_inode_queue(bh);
}
@@ -741,6 +743,7 @@
bh->b_list = BUF_CLEAN;
bh->b_end_io = handler;
bh->b_private = private;
+ bh->b_journal_head = NULL;
}
static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
@@ -842,9 +845,9 @@
bh = BH_ENTRY(list->next);
list_del(&bh->b_inode_buffers);
if (!buffer_dirty(bh) && !buffer_locked(bh))
- bh->b_inode = NULL;
+ clear_buffer_inode(bh);
else {
- bh->b_inode = &tmp;
+ set_buffer_inode(bh);
list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
if (buffer_dirty(bh)) {
get_bh(bh);
@@ -1138,7 +1141,7 @@
*/
static void __put_unused_buffer_head(struct buffer_head * bh)
{
- if (bh->b_inode)
+ if (buffer_inode(bh))
BUG();
if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
kmem_cache_free(bh_cachep, bh);
--- diff/fs/jbd/journal.c 2002-11-29 09:36:46.000000000 +0000
+++ source/fs/jbd/journal.c 2002-11-29 09:44:59.000000000 +0000
@@ -1664,8 +1664,8 @@
*
* Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
* is set. This bit is tested in core kernel code where we need to take
- * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
- * there.
+ * JBD-specific actions. Testing the zeroness of ->b_journal_head is not
+ * reliable there.
*
* When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
*
@@ -1720,9 +1720,9 @@
if (buffer_jbd(bh)) {
/* Someone did it for us! */
- J_ASSERT_BH(bh, bh->b_private != NULL);
+ J_ASSERT_BH(bh, bh->b_journal_head != NULL);
journal_free_journal_head(jh);
- jh = bh->b_private;
+ jh = bh->b_journal_head;
} else {
/*
* We actually don't need jh_splice_lock when
@@ -1730,7 +1730,7 @@
*/
spin_lock(&jh_splice_lock);
set_bit(BH_JBD, &bh->b_state);
- bh->b_private = jh;
+ bh->b_journal_head = jh;
jh->b_bh = bh;
atomic_inc(&bh->b_count);
spin_unlock(&jh_splice_lock);
@@ -1739,7 +1739,7 @@
}
jh->b_jcount++;
spin_unlock(&journal_datalist_lock);
- return bh->b_private;
+ return bh->b_journal_head;
}
/*
@@ -1772,7 +1772,7 @@
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
spin_lock(&jh_splice_lock);
- bh->b_private = NULL;
+ bh->b_journal_head = NULL;
jh->b_bh = NULL; /* debug, really */
clear_bit(BH_JBD, &bh->b_state);
__brelse(bh);
--- diff/include/linux/fs.h 2002-11-29 09:36:47.000000000 +0000
+++ source/include/linux/fs.h 2002-11-29 09:44:59.000000000 +0000
@@ -220,6 +220,7 @@
BH_Wait_IO, /* 1 if we should write out this buffer */
BH_Launder, /* 1 if we can throttle on this buffer */
BH_JBD, /* 1 if it has an attached journal_head */
+ BH_Inode, /* 1 if it is attached to i_dirty[_data]_buffers */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -262,11 +263,10 @@
struct page *b_page; /* the page this bh is mapped to */
void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
void *b_private; /* reserved for b_end_io */
-
+ void *b_journal_head; /* ext3 journal_heads */
unsigned long b_rsector; /* Real buffer location on disk */
wait_queue_head_t b_wait;
- struct inode * b_inode;
struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */
};
@@ -1186,6 +1186,21 @@
clear_bit(BH_Async, &bh->b_state);
}
+static inline void set_buffer_inode(struct buffer_head *bh)
+{
+ set_bit(BH_Inode, &bh->b_state);
+}
+
+static inline void clear_buffer_inode(struct buffer_head *bh)
+{
+ clear_bit(BH_Inode, &bh->b_state);
+}
+
+static inline int buffer_inode(struct buffer_head *bh)
+{
+ return test_bit(BH_Inode, &bh->b_state);
+}
+
/*
* If an error happens during the make_request, this function
* has to be recalled. It marks the buffer as clean and not
--- diff/include/linux/jbd.h 2002-11-29 09:36:47.000000000 +0000
+++ source/include/linux/jbd.h 2002-11-29 09:44:59.000000000 +0000
@@ -254,7 +254,7 @@
static inline struct journal_head *bh2jh(struct buffer_head *bh)
{
- return bh->b_private;
+ return bh->b_journal_head;
}
#define HAVE_JOURNAL_CALLBACK_STATUS
--- diff/include/linux/vmalloc.h 2002-11-29 09:36:47.000000000 +0000
+++ source/include/linux/vmalloc.h 2002-11-29 09:44:59.000000000 +0000
@@ -26,6 +26,7 @@
extern void vmfree_area_pages(unsigned long address, unsigned long size);
extern int vmalloc_area_pages(unsigned long address, unsigned long size,
int gfp_mask, pgprot_t prot);
+extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
/*
* Allocate any pages
--- diff/kernel/ksyms.c 2002-11-29 09:36:47.000000000 +0000
+++ source/kernel/ksyms.c 2002-11-29 09:44:59.000000000 +0000
@@ -111,6 +111,7 @@
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(__vmalloc);
EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vcalloc);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(remap_page_range);
EXPORT_SYMBOL(max_mapnr);
--- diff/mm/Makefile 2002-08-05 14:57:44.000000000 +0100
+++ source/mm/Makefile 2002-11-29 09:44:59.000000000 +0000
@@ -9,12 +9,12 @@
O_TARGET := mm.o
-export-objs := shmem.o filemap.o memory.o page_alloc.o
+export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
- shmem.o
+ shmem.o mempool.o
obj-$(CONFIG_HIGHMEM) += highmem.o
--- diff/mm/vmalloc.c 2002-11-29 09:36:47.000000000 +0000
+++ source/mm/vmalloc.c 2002-11-29 09:44:59.000000000 +0000
@@ -327,3 +327,22 @@
read_unlock(&vmlist_lock);
return buf - buf_start;
}
+
+void *vcalloc(unsigned long nmemb, unsigned long elem_size)
+{
+ unsigned long size;
+ void *addr;
+
+ /*
+ * Check that we're not going to overflow.
+ */
+ if (nmemb > (ULONG_MAX / elem_size))
+ return NULL;
+
+ size = nmemb * elem_size;
+ addr = vmalloc(size);
+ if (addr)
+ memset(addr, 0, size);
+
+ return addr;
+}
--- diff/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-exception-store.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,701 @@
+/*
+ * dm-exception-store.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-snapshot.h"
+#include "kcopyd.h"
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#define SECTOR_SIZE 512
+#define SECTOR_SHIFT 9
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots; by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device. The snapshot code
+ * requires that we copy exception chunks to chunk-aligned areas
+ * of the COW store. It makes sense, therefore, to store the
+ * metadata in chunk-sized blocks.
+ *
+ * There is no backward or forward compatibility implemented;
+ * snapshots with a disk version different from the kernel's will
+ * not be usable. It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format. The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble, isn't it?
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+ uint32_t magic;
+
+ /*
+	 * Is this snapshot valid? There is no way of recovering
+ * an invalid snapshot.
+ */
+ int valid;
+
+ /*
+	 * Simple, incrementing version. No backward
+ * compatibility.
+ */
+ uint32_t version;
+
+ /* In sectors */
+ uint32_t chunk_size;
+};
+
+struct disk_exception {
+ uint64_t old_chunk;
+ uint64_t new_chunk;
+};
+
+struct commit_callback {
+ void (*callback)(void *, int success);
+ void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+ struct dm_snapshot *snap; /* up pointer to my snapshot */
+ int version;
+ int valid;
+ uint32_t chunk_size;
+ uint32_t exceptions_per_area;
+
+ /*
+ * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it won't hurt to have a
+	 * whole chunk's worth of metadata in memory at once.
+ */
+ void *area;
+ struct kiobuf *iobuf;
+
+ /*
+	 * Used to keep track of which metadata area the data in
+	 * 'area' refers to.
+ */
+ uint32_t current_area;
+
+ /*
+ * The next free chunk for an exception.
+ */
+ uint32_t next_free;
+
+ /*
+	 * The index of the next free exception in the current
+ * metadata area.
+ */
+ uint32_t current_committed;
+
+ atomic_t pending_count;
+ uint32_t callback_count;
+ struct commit_callback *callbacks;
+};
+
+/*
+ * For performance reasons we want to defer writing a committed
+ * exception's metadata to disk so that we can amortise away this
+ * expensive operation.
+ *
+ * For the initial version of this code we will stick with
+ * synchronous I/O. There are some deadlock issues with async
+ * that I haven't yet worked out.
+ */
+static int do_io(int rw, struct kcopyd_region *where, struct kiobuf *iobuf)
+{
+ int i, sectors_per_block, nr_blocks, start;
+ int blocksize = get_hardsect_size(where->dev);
+ int status;
+
+ sectors_per_block = blocksize / SECTOR_SIZE;
+
+ nr_blocks = where->count / sectors_per_block;
+ start = where->sector / sectors_per_block;
+
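+	/*
+	 * brw_kiovec works in units of the device block size, so
+	 * describe the region as a run of consecutive block numbers.
+	 */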
+ for (i = 0; i < nr_blocks; i++)
+ iobuf->blocks[i] = start++;
+
+ iobuf->length = where->count << 9;
+ iobuf->locked = 1;
+
+ status = brw_kiovec(rw, 1, &iobuf, where->dev, iobuf->blocks,
+ blocksize);
+ if (status != (where->count << 9))
+ return -EIO;
+
+ return 0;
+}
+
+static int allocate_iobuf(struct pstore *ps)
+{
+ size_t i, r = -ENOMEM, len, nr_pages;
+ struct page *page;
+
+ len = ps->chunk_size << SECTOR_SHIFT;
+
+ /*
+ * Allocate the chunk_size block of memory that will hold
+ * a single metadata area.
+ */
+ ps->area = vmalloc(len);
+ if (!ps->area)
+ return r;
+
+ if (alloc_kiovec(1, &ps->iobuf))
+ goto bad;
+
+ nr_pages = ps->chunk_size / (PAGE_SIZE / SECTOR_SIZE);
+ r = expand_kiobuf(ps->iobuf, nr_pages);
+ if (r)
+ goto bad;
+
+ /*
+ * We lock the pages for ps->area into memory since they'll be
+	 * doing a lot of I/O.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
+ LockPage(page);
+ ps->iobuf->maplist[i] = page;
+ ps->iobuf->nr_pages++;
+ }
+
+ ps->iobuf->nr_pages = nr_pages;
+ ps->iobuf->offset = 0;
+
+ return 0;
+
+ bad:
+ if (ps->iobuf)
+ free_kiovec(1, &ps->iobuf);
+
+ if (ps->area)
+ vfree(ps->area);
+ ps->iobuf = NULL;
+ return r;
+}
+
+static void free_iobuf(struct pstore *ps)
+{
+ int i;
+
+ for (i = 0; i < ps->iobuf->nr_pages; i++)
+ UnlockPage(ps->iobuf->maplist[i]);
+ ps->iobuf->locked = 0;
+
+ free_kiovec(1, &ps->iobuf);
+ vfree(ps->area);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
+{
+ int r;
+ struct kcopyd_region where;
+
+ where.dev = ps->snap->cow->dev;
+ where.sector = ps->chunk_size * chunk;
+ where.count = ps->chunk_size;
+
+ r = do_io(rw, &where, ps->iobuf);
+ if (r)
+ return r;
+
+ return 0;
+}
+
+/*
+ * Read or write a metadata area, remembering to skip the first
+ * chunk, which holds the header.
+ */
+static int area_io(struct pstore *ps, uint32_t area, int rw)
+{
+ int r;
+ uint32_t chunk;
+
+ /* convert a metadata area index to a chunk index */
+ chunk = 1 + ((ps->exceptions_per_area + 1) * area);
+
+ r = chunk_io(ps, chunk, rw);
+ if (r)
+ return r;
+
+ ps->current_area = area;
+ return 0;
+}
+
+static int zero_area(struct pstore *ps, uint32_t area)
+{
+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
+ return area_io(ps, area, WRITE);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+ int r;
+ struct disk_header *dh;
+
+ r = chunk_io(ps, 0, READ);
+ if (r)
+ return r;
+
+ dh = (struct disk_header *) ps->area;
+
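+	/* A zeroed header means a freshly blanked COW device, i.e. a new snapshot. */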
+ if (dh->magic == 0) {
+ *new_snapshot = 1;
+
+ } else if (dh->magic == SNAP_MAGIC) {
+ *new_snapshot = 0;
+ ps->valid = dh->valid;
+ ps->version = dh->version;
+ ps->chunk_size = dh->chunk_size;
+
+ } else {
+ DMWARN("Invalid/corrupt snapshot");
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+ struct disk_header *dh;
+
+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
+
+ dh = (struct disk_header *) ps->area;
+ dh->magic = SNAP_MAGIC;
+ dh->valid = ps->valid;
+ dh->version = ps->version;
+ dh->chunk_size = ps->chunk_size;
+
+ return chunk_io(ps, 0, WRITE);
+}
+
+/*
+ * Access functions for the disk exceptions; these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+ if (index >= ps->exceptions_per_area)
+ return NULL;
+
+ return ((struct disk_exception *) ps->area) + index;
+}
+
+static int read_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *result)
+{
+ struct disk_exception *e;
+
+ e = get_exception(ps, index);
+ if (!e)
+ return -EINVAL;
+
+ /* copy it */
+ result->old_chunk = le64_to_cpu(e->old_chunk);
+ result->new_chunk = le64_to_cpu(e->new_chunk);
+
+ return 0;
+}
+
+static int write_exception(struct pstore *ps,
+ uint32_t index, struct disk_exception *de)
+{
+ struct disk_exception *e;
+
+ e = get_exception(ps, index);
+ if (!e)
+ return -EINVAL;
+
+ /* copy it */
+ e->old_chunk = cpu_to_le64(de->old_chunk);
+ e->new_chunk = cpu_to_le64(de->new_chunk);
+
+ return 0;
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps, int *full)
+{
+ int i, r;
+ struct disk_exception de;
+
+ /* presume the area is full */
+ *full = 1;
+
+ for (i = 0; i < ps->exceptions_per_area; i++) {
+ r = read_exception(ps, i, &de);
+
+ if (r)
+ return r;
+
+ /*
+		 * If new_chunk is pointing at the start of the
+		 * COW device, where the first metadata area is,
+		 * we know that we've hit the end of the
+		 * exceptions. Therefore the area is not full.
+ */
+ if (de.new_chunk == 0LL) {
+ ps->current_committed = i;
+ *full = 0;
+ break;
+ }
+
+ /*
+ * Keep track of the start of the free chunks.
+ */
+ if (ps->next_free <= de.new_chunk)
+ ps->next_free = de.new_chunk + 1;
+
+ /*
+ * Otherwise we add the exception to the snapshot.
+ */
+ r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int read_exceptions(struct pstore *ps)
+{
+ uint32_t area;
+ int r, full = 1;
+
+ /*
+	 * Keep reading chunks and inserting exceptions until
+ * we find a partially full area.
+ */
+ for (area = 0; full; area++) {
+ r = area_io(ps, area, READ);
+ if (r)
+ return r;
+
+ r = insert_exceptions(ps, &full);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static inline struct pstore *get_info(struct exception_store *store)
+{
+ return (struct pstore *) store->context;
+}
+
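+/* Report how much of the COW device has been used, as a percentage. */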
+static int persistent_percentfull(struct exception_store *store)
+{
+ struct pstore *ps = get_info(store);
+ return (ps->next_free * store->snap->chunk_size * 100) /
+ get_dev_size(store->snap->cow->dev);
+}
+
+static void persistent_destroy(struct exception_store *store)
+{
+ struct pstore *ps = get_info(store);
+
+ vfree(ps->callbacks);
+ free_iobuf(ps);
+ kfree(ps);
+}
+
+static int persistent_prepare(struct exception_store *store,
+ struct exception *e)
+{
+ struct pstore *ps = get_info(store);
+ uint32_t stride;
+ sector_t size = get_dev_size(store->snap->cow->dev);
+
+ /* Is there enough room ? */
+ if (size <= (ps->next_free * store->snap->chunk_size))
+ return -ENOSPC;
+
+ e->new_chunk = ps->next_free;
+
+ /*
+	 * Move on to the next free chunk, making sure to take
+ * into account the location of the metadata chunks.
+ */
+ stride = (ps->exceptions_per_area + 1);
+ if (!(++ps->next_free % stride))
+ ps->next_free++;
+
+ atomic_inc(&ps->pending_count);
+ return 0;
+}
+
+static void persistent_commit(struct exception_store *store,
+ struct exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context)
+{
+ int r, i;
+ struct pstore *ps = get_info(store);
+ struct disk_exception de;
+ struct commit_callback *cb;
+
+ de.old_chunk = e->old_chunk;
+ de.new_chunk = e->new_chunk;
+ write_exception(ps, ps->current_committed++, &de);
+
+ /*
+ * Add the callback to the back of the array. This code
+ * is the only place where the callback array is
+ * manipulated, and we know that it will never be called
+ * multiple times concurrently.
+ */
+ cb = ps->callbacks + ps->callback_count++;
+ cb->callback = callback;
+ cb->context = callback_context;
+
+ /*
+ * If there are no more exceptions in flight, or we have
+	 * filled this metadata area, we commit the exceptions to
+ * disk.
+ */
+ if (atomic_dec_and_test(&ps->pending_count) ||
+ (ps->current_committed == ps->exceptions_per_area)) {
+ r = area_io(ps, ps->current_area, WRITE);
+ if (r)
+ ps->valid = 0;
+
+ for (i = 0; i < ps->callback_count; i++) {
+ cb = ps->callbacks + i;
+ cb->callback(cb->context, r == 0 ? 1 : 0);
+ }
+
+ ps->callback_count = 0;
+ }
+
+ /*
+ * Have we completely filled the current area ?
+ */
+ if (ps->current_committed == ps->exceptions_per_area) {
+ ps->current_committed = 0;
+ r = zero_area(ps, ps->current_area + 1);
+ if (r)
+ ps->valid = 0;
+ }
+}
+
+static void persistent_drop(struct exception_store *store)
+{
+ struct pstore *ps = get_info(store);
+
+ ps->valid = 0;
+ if (write_header(ps))
+ DMWARN("write header failed");
+}
+
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
+{
+ int r, new_snapshot;
+ struct pstore *ps;
+
+ /* allocate the pstore */
+ ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ return -ENOMEM;
+
+ ps->snap = store->snap;
+ ps->valid = 1;
+ ps->version = SNAPSHOT_DISK_VERSION;
+ ps->chunk_size = chunk_size;
+ ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
+ sizeof(struct disk_exception);
+ ps->next_free = 2; /* skipping the header and first area */
+	ps->current_committed = 0;
+	ps->callbacks = NULL;	/* so the error path below is safe */
+	ps->iobuf = NULL;
+
+ r = allocate_iobuf(ps);
+ if (r)
+ goto bad;
+
+ /*
+ * Allocate space for all the callbacks.
+ */
+ ps->callback_count = 0;
+ atomic_set(&ps->pending_count, 0);
+ ps->callbacks = vcalloc(ps->exceptions_per_area,
+ sizeof(*ps->callbacks));
+
+	if (!ps->callbacks) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+ /*
+ * Read the snapshot header.
+ */
+ r = read_header(ps, &new_snapshot);
+ if (r)
+ goto bad;
+
+ /*
+	 * Do we need to set up a new snapshot ?
+ */
+ if (new_snapshot) {
+ r = write_header(ps);
+ if (r) {
+ DMWARN("write_header failed");
+ goto bad;
+ }
+
+ r = zero_area(ps, 0);
+ if (r) {
+ DMWARN("zero_area(0) failed");
+ goto bad;
+ }
+
+ } else {
+ /*
+ * Sanity checks.
+ */
+ if (ps->chunk_size != chunk_size) {
+ DMWARN("chunk size for existing snapshot different "
+ "from that requested");
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (ps->version != SNAPSHOT_DISK_VERSION) {
+ DMWARN("unable to handle snapshot disk version %d",
+ ps->version);
+ r = -EINVAL;
+ goto bad;
+ }
+
+ /*
+ * Read the metadata.
+ */
+ r = read_exceptions(ps);
+ if (r)
+ goto bad;
+ }
+
+ store->destroy = persistent_destroy;
+ store->prepare_exception = persistent_prepare;
+ store->commit_exception = persistent_commit;
+ store->drop_snapshot = persistent_drop;
+ store->percent_full = persistent_percentfull;
+ store->context = ps;
+
+ return r;
+
+ bad:
+ if (ps) {
+ if (ps->callbacks)
+ vfree(ps->callbacks);
+
+ if (ps->iobuf)
+ free_iobuf(ps);
+
+ kfree(ps);
+ }
+ return r;
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+ sector_t next_free;
+};
+
+void transient_destroy(struct exception_store *store)
+{
+ kfree(store->context);
+}
+
+int transient_prepare(struct exception_store *store, struct exception *e)
+{
+ struct transient_c *tc = (struct transient_c *) store->context;
+ sector_t size = get_dev_size(store->snap->cow->dev);
+
+ if (size < (tc->next_free + store->snap->chunk_size))
+ return -1;
+
+ e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
+ tc->next_free += store->snap->chunk_size;
+
+ return 0;
+}
+
+void transient_commit(struct exception_store *store,
+ struct exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context)
+{
+ /* Just succeed */
+ callback(callback_context, 1);
+}
+
+static int transient_percentfull(struct exception_store *store)
+{
+ struct transient_c *tc = (struct transient_c *) store->context;
+ return (tc->next_free * 100) / get_dev_size(store->snap->cow->dev);
+}
+
+int dm_create_transient(struct exception_store *store,
+ struct dm_snapshot *s, int blocksize)
+{
+ struct transient_c *tc;
+
+ memset(store, 0, sizeof(*store));
+ store->destroy = transient_destroy;
+ store->prepare_exception = transient_prepare;
+ store->commit_exception = transient_commit;
+ store->percent_full = transient_percentfull;
+ store->snap = s;
+
+ tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+ if (!tc)
+ return -ENOMEM;
+
+ tc->next_free = 0;
+ store->context = tc;
+
+ return 0;
+}
--- diff/drivers/md/dm-ioctl.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-ioctl.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,1135 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/compatmac.h>
+#include <linux/miscdevice.h>
+#include <linux/dm-ioctl.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/blk.h>
+#include <linux/slab.h>
+
+#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
+
+/*-----------------------------------------------------------------
+ * The ioctl interface needs to be able to look up devices by
+ * name or uuid.
+ *---------------------------------------------------------------*/
+struct hash_cell {
+ struct list_head name_list;
+ struct list_head uuid_list;
+
+ char *name;
+ char *uuid;
+ struct mapped_device *md;
+
+ /* I hate devfs */
+ devfs_handle_t devfs_entry;
+};
+
+#define NUM_BUCKETS 64
+#define MASK_BUCKETS (NUM_BUCKETS - 1)
+static struct list_head _name_buckets[NUM_BUCKETS];
+static struct list_head _uuid_buckets[NUM_BUCKETS];
+
+static devfs_handle_t _dev_dir;
+void dm_hash_remove_all(void);
+
+/*
+ * Guards access to both hash tables.
+ */
+static DECLARE_RWSEM(_hash_lock);
+
+static void init_buckets(struct list_head *buckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < NUM_BUCKETS; i++)
+ INIT_LIST_HEAD(buckets + i);
+}
+
+int dm_hash_init(void)
+{
+ init_buckets(_name_buckets);
+ init_buckets(_uuid_buckets);
+ _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
+ return 0;
+}
+
+void dm_hash_exit(void)
+{
+ dm_hash_remove_all();
+ devfs_unregister(_dev_dir);
+}
+
+/*-----------------------------------------------------------------
+ * Hash function:
+ * We're not really concerned with the str hash function being
+ * fast since it's only used by the ioctl interface.
+ *---------------------------------------------------------------*/
+static unsigned int hash_str(const char *str)
+{
+ const unsigned int hash_mult = 2654435387U;
+ unsigned int h = 0;
+
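+	/* Simple multiplicative string hash, folded down to a bucket index. */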
+ while (*str)
+ h = (h + (unsigned int) *str++) * hash_mult;
+
+ return h & MASK_BUCKETS;
+}
+
+/*-----------------------------------------------------------------
+ * Code for looking up a device by name
+ *---------------------------------------------------------------*/
+static struct hash_cell *__get_name_cell(const char *str)
+{
+ struct list_head *tmp;
+ struct hash_cell *hc;
+ unsigned int h = hash_str(str);
+
+ list_for_each (tmp, _name_buckets + h) {
+ hc = list_entry(tmp, struct hash_cell, name_list);
+ if (!strcmp(hc->name, str))
+ return hc;
+ }
+
+ return NULL;
+}
+
+static struct hash_cell *__get_uuid_cell(const char *str)
+{
+ struct list_head *tmp;
+ struct hash_cell *hc;
+ unsigned int h = hash_str(str);
+
+ list_for_each (tmp, _uuid_buckets + h) {
+ hc = list_entry(tmp, struct hash_cell, uuid_list);
+ if (!strcmp(hc->uuid, str))
+ return hc;
+ }
+
+ return NULL;
+}
+
+/*-----------------------------------------------------------------
+ * Inserting, removing and renaming a device.
+ *---------------------------------------------------------------*/
+static inline char *kstrdup(const char *str)
+{
+ char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
+ if (r)
+ strcpy(r, str);
+ return r;
+}
+
+static struct hash_cell *alloc_cell(const char *name, const char *uuid,
+ struct mapped_device *md)
+{
+ struct hash_cell *hc;
+
+ hc = kmalloc(sizeof(*hc), GFP_KERNEL);
+ if (!hc)
+ return NULL;
+
+ hc->name = kstrdup(name);
+ if (!hc->name) {
+ kfree(hc);
+ return NULL;
+ }
+
+ if (!uuid)
+ hc->uuid = NULL;
+
+ else {
+ hc->uuid = kstrdup(uuid);
+ if (!hc->uuid) {
+ kfree(hc->name);
+ kfree(hc);
+ return NULL;
+ }
+ }
+
+ INIT_LIST_HEAD(&hc->name_list);
+ INIT_LIST_HEAD(&hc->uuid_list);
+ hc->md = md;
+ return hc;
+}
+
+static void free_cell(struct hash_cell *hc)
+{
+ if (hc) {
+ kfree(hc->name);
+ kfree(hc->uuid);
+ kfree(hc);
+ }
+}
+
+/*
+ * devfs stuff.
+ */
+static int register_with_devfs(struct hash_cell *hc)
+{
+ kdev_t dev = dm_kdev(hc->md);
+
+ hc->devfs_entry =
+ devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
+ major(dev), minor(dev),
+ S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
+ &dm_blk_dops, NULL);
+
+ return 0;
+}
+
+static int unregister_with_devfs(struct hash_cell *hc)
+{
+ devfs_unregister(hc->devfs_entry);
+ return 0;
+}
+
+/*
+ * The kdev_t and uuid of a device can never change once it is
+ * initially inserted.
+ */
+int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
+{
+ struct hash_cell *cell;
+
+ /*
+	 * Allocate the new cell.
+ */
+ cell = alloc_cell(name, uuid, md);
+ if (!cell)
+ return -ENOMEM;
+
+ /*
+	 * Insert the cell into the hash tables.
+ */
+ down_write(&_hash_lock);
+ if (__get_name_cell(name))
+ goto bad;
+
+ list_add(&cell->name_list, _name_buckets + hash_str(name));
+
+ if (uuid) {
+ if (__get_uuid_cell(uuid)) {
+ list_del(&cell->name_list);
+ goto bad;
+ }
+ list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
+ }
+ register_with_devfs(cell);
+ dm_get(md);
+ up_write(&_hash_lock);
+
+ return 0;
+
+ bad:
+ up_write(&_hash_lock);
+ free_cell(cell);
+ return -EBUSY;
+}
+
+void __hash_remove(struct hash_cell *hc)
+{
+	/* remove from the name and uuid hashes */
+ list_del(&hc->uuid_list);
+ list_del(&hc->name_list);
+ unregister_with_devfs(hc);
+ dm_put(hc->md);
+}
+
+void dm_hash_remove_all(void)
+{
+ int i;
+ struct hash_cell *hc;
+ struct list_head *tmp, *n;
+
+ down_write(&_hash_lock);
+ for (i = 0; i < NUM_BUCKETS; i++) {
+ list_for_each_safe (tmp, n, _name_buckets + i) {
+ hc = list_entry(tmp, struct hash_cell, name_list);
+ __hash_remove(hc);
+ }
+ }
+ up_write(&_hash_lock);
+}
+
+int dm_hash_rename(const char *old, const char *new)
+{
+ char *new_name, *old_name;
+ struct hash_cell *hc;
+
+ /*
+ * duplicate new.
+ */
+ new_name = kstrdup(new);
+ if (!new_name)
+ return -ENOMEM;
+
+ down_write(&_hash_lock);
+
+ /*
+ * Is new free ?
+ */
+ hc = __get_name_cell(new);
+ if (hc) {
+ DMWARN("asked to rename to an already existing name %s -> %s",
+ old, new);
+ up_write(&_hash_lock);
+ return -EBUSY;
+ }
+
+ /*
+ * Is there such a device as 'old' ?
+ */
+ hc = __get_name_cell(old);
+ if (!hc) {
+ DMWARN("asked to rename a non existent device %s -> %s",
+ old, new);
+ up_write(&_hash_lock);
+ return -ENXIO;
+ }
+
+ /*
+ * rename and move the name cell.
+ */
+ list_del(&hc->name_list);
+ old_name = hc->name;
+ hc->name = new_name;
+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+ /* rename the device node in devfs */
+ unregister_with_devfs(hc);
+ register_with_devfs(hc);
+
+ up_write(&_hash_lock);
+ kfree(old_name);
+ return 0;
+}
+
+
+/*-----------------------------------------------------------------
+ * Implementation of the ioctl commands
+ *---------------------------------------------------------------*/
+
+/*
+ * All the ioctl commands get dispatched to functions with this
+ * prototype.
+ */
+typedef int (*ioctl_fn)(struct dm_ioctl *param, struct dm_ioctl *user);
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from userland.
+ */
+static int valid_str(char *str, void *begin, void *end)
+{
+ while (((void *) str >= begin) && ((void *) str < end))
+ if (!*str++)
+ return 0;
+
+ return -EINVAL;
+}
+
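+/*
+ * Step from 'last' to the spec 'next' bytes further on, checking that
+ * both the new spec and its parameter string lie within the buffer
+ * copied in from userland.
+ */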
+static int next_target(struct dm_target_spec *last, uint32_t next,
+ void *begin, void *end,
+ struct dm_target_spec **spec, char **params)
+{
+ *spec = (struct dm_target_spec *)
+ ((unsigned char *) last + next);
+ *params = (char *) (*spec + 1);
+
+ if (*spec < (last + 1) || ((void *) *spec > end))
+ return -EINVAL;
+
+ return valid_str(*params, begin, end);
+}
+
+static int populate_table(struct dm_table *table, struct dm_ioctl *args)
+{
+ int i = 0, r, first = 1;
+ struct dm_target_spec *spec;
+ char *params;
+ void *begin, *end;
+
+ if (!args->target_count) {
+ DMWARN("populate_table: no targets specified");
+ return -EINVAL;
+ }
+
+ begin = (void *) args;
+ end = begin + args->data_size;
+
+ for (i = 0; i < args->target_count; i++) {
+
+ if (first)
+ r = next_target((struct dm_target_spec *) args,
+ args->data_start,
+ begin, end, &spec, ¶ms);
+ else
+ r = next_target(spec, spec->next, begin, end,
+ &spec, ¶ms);
+
+ if (r) {
+ DMWARN("unable to find target");
+ return -EINVAL;
+ }
+
+ r = dm_table_add_target(table, spec->target_type,
+ spec->sector_start, spec->length,
+ params);
+ if (r) {
+ DMWARN("internal error adding target to table");
+ return -EINVAL;
+ }
+
+ first = 0;
+ }
+
+ return dm_table_complete(table);
+}
+
+/*
+ * Round up the ptr to the next 'align' boundary. Obviously
+ * 'align' must be a power of 2.
+ */
+static inline void *align_ptr(void *ptr, unsigned int align)
+{
+ align--;
+ return (void *) (((unsigned long) (ptr + align)) & ~align);
+}
+
+/*
+ * Copies a dm_ioctl and an optional additional payload to
+ * userland.
+ */
+static int results_to_user(struct dm_ioctl *user, struct dm_ioctl *param,
+ void *data, uint32_t len)
+{
+ int r;
+ void *ptr = NULL;
+
+ if (data) {
+ ptr = align_ptr(user + 1, sizeof(unsigned long));
+ param->data_start = ptr - (void *) user;
+ }
+
+ /*
+ * The version number has already been filled in, so we
+ * just copy later fields.
+ */
+ r = copy_to_user(&user->data_size, ¶m->data_size,
+ sizeof(*param) - sizeof(param->version));
+ if (r)
+ return -EFAULT;
+
+ if (data) {
+ if (param->data_start + len > param->data_size)
+ return -ENOSPC;
+
+ if (copy_to_user(ptr, data, len))
+ r = -EFAULT;
+ }
+
+ return r;
+}
+
+/*
+ * Fills in a dm_ioctl structure, ready for sending back to
+ * userland.
+ */
+static int __info(struct mapped_device *md, struct dm_ioctl *param)
+{
+ kdev_t dev = dm_kdev(md);
+ struct dm_table *table;
+ struct block_device *bdev;
+
+ param->flags = DM_EXISTS_FLAG;
+ if (dm_suspended(md))
+ param->flags |= DM_SUSPEND_FLAG;
+
+ param->dev = kdev_t_to_nr(dev);
+ bdev = bdget(param->dev);
+ if (!bdev)
+ return -ENXIO;
+
+ param->open_count = bdev->bd_openers;
+ bdput(bdev);
+
+ if (is_read_only(dev))
+ param->flags |= DM_READONLY_FLAG;
+
+ table = dm_get_table(md);
+ param->target_count = dm_table_get_num_targets(table);
+ dm_table_put(table);
+
+ return 0;
+}
+
+/*
+ * Always use UUID for lookups if it's present, otherwise use name.
+ */
+static inline struct mapped_device *find_device(struct dm_ioctl *param)
+{
+ struct hash_cell *hc;
+ struct mapped_device *md = NULL;
+
+ down_read(&_hash_lock);
+ hc = *param->uuid ? __get_uuid_cell(param->uuid) :
+ __get_name_cell(param->name);
+ if (hc) {
+ md = hc->md;
+
+ /*
+ * Sneakily write in both the name and the uuid
+ * while we have the cell.
+ */
+ strncpy(param->name, hc->name, sizeof(param->name));
+ if (hc->uuid)
+ strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
+ else
+ param->uuid[0] = '\0';
+
+ dm_get(md);
+ }
+ up_read(&_hash_lock);
+
+ return md;
+}
+
+#define ALIGNMENT sizeof(int)
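+/* Round 'ptr' up to the next 'a' boundary; 'a' must be a power of 2. */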
+static void *_align(void *ptr, unsigned int a)
+{
+ register unsigned long align = --a;
+
+ return (void *) (((unsigned long) ptr + align) & ~align);
+}
+
+/*
+ * Copies device info back to user space, used by
+ * the create and info ioctls.
+ */
+static int info(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ struct mapped_device *md;
+
+ param->flags = 0;
+
+ md = find_device(param);
+ if (!md)
+ /*
+ * Device not found - returns cleared exists flag.
+ */
+ goto out;
+
+ __info(md, param);
+ dm_put(md);
+
+ out:
+ return results_to_user(user, param, NULL, 0);
+}
+
+static inline int get_mode(struct dm_ioctl *param)
+{
+ int mode = FMODE_READ | FMODE_WRITE;
+
+ if (param->flags & DM_READONLY_FLAG)
+ mode = FMODE_READ;
+
+ return mode;
+}
+
+static int check_name(const char *name)
+{
+ if (strchr(name, '/')) {
+ DMWARN("invalid device name");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int create(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ int r;
+ kdev_t dev;
+ struct dm_table *t;
+ struct mapped_device *md;
+ int minor;
+
+ r = check_name(param->name);
+ if (r)
+ return r;
+
+ r = dm_table_create(&t, get_mode(param));
+ if (r)
+ return r;
+
+ r = populate_table(t, param);
+ if (r) {
+ dm_table_put(t);
+ return r;
+ }
+
+ minor = (param->flags & DM_PERSISTENT_DEV_FLAG) ?
+ minor(to_kdev_t(param->dev)) : -1;
+
+ r = dm_create(minor, t, &md);
+ if (r) {
+ dm_table_put(t);
+ return r;
+ }
+ dm_table_put(t); /* md will have grabbed its own reference */
+
+ dev = dm_kdev(md);
+ set_device_ro(dev, (param->flags & DM_READONLY_FLAG));
+ r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
+ dm_put(md);
+
+ return r ? r : info(param, user);
+}
+
+/*
+ * Build up the status struct for each target
+ */
+static int __status(struct mapped_device *md, struct dm_ioctl *param,
+ char *outbuf, int *len)
+{
+ int i, num_targets;
+ struct dm_target_spec *spec;
+ char *outptr;
+ status_type_t type;
+ struct dm_table *table = dm_get_table(md);
+
+ if (param->flags & DM_STATUS_TABLE_FLAG)
+ type = STATUSTYPE_TABLE;
+ else
+ type = STATUSTYPE_INFO;
+
+ outptr = outbuf;
+
+ /* Get all the target info */
+ num_targets = dm_table_get_num_targets(table);
+ for (i = 0; i < num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(table, i);
+
+ if (outptr - outbuf +
+ sizeof(struct dm_target_spec) > param->data_size) {
+ dm_table_put(table);
+ return -ENOMEM;
+ }
+
+ spec = (struct dm_target_spec *) outptr;
+
+ spec->status = 0;
+ spec->sector_start = ti->begin;
+ spec->length = ti->len;
+ strncpy(spec->target_type, ti->type->name,
+ sizeof(spec->target_type));
+
+ outptr += sizeof(struct dm_target_spec);
+
+ /* Get the status/table string from the target driver */
+ if (ti->type->status)
+ ti->type->status(ti, type, outptr,
+ outbuf + param->data_size - outptr);
+ else
+ outptr[0] = '\0';
+
+ outptr += strlen(outptr) + 1;
+		outptr = _align(outptr, ALIGNMENT);
+ spec->next = outptr - outbuf;
+ }
+
+ param->target_count = num_targets;
+ *len = outptr - outbuf;
+ dm_table_put(table);
+
+ return 0;
+}
+
+/*
+ * Return the status of a device as a text string for each
+ * target.
+ */
+static int get_status(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ struct mapped_device *md;
+ int len = 0;
+ int ret;
+ char *outbuf = NULL;
+
+ md = find_device(param);
+ if (!md)
+ /*
+ * Device not found - returns cleared exists flag.
+ */
+ goto out;
+
+ /* We haven't a clue how long the resultant data will be so
+ just allocate as much as userland has allowed us and make sure
+	   we don't overrun it */
+ outbuf = kmalloc(param->data_size, GFP_KERNEL);
+ if (!outbuf)
+ goto out;
+ /*
+ * Get the status of all targets
+ */
+ __status(md, param, outbuf, &len);
+
+ /*
+ * Setup the basic dm_ioctl structure.
+ */
+ __info(md, param);
+
+ out:
+ if (md)
+ dm_put(md);
+
+ ret = results_to_user(user, param, outbuf, len);
+
+ if (outbuf)
+ kfree(outbuf);
+
+ return ret;
+}
+
+/*
+ * Wait for a device to report an event
+ */
+static int wait_device_event(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ struct mapped_device *md;
+ struct dm_table *table;
+ DECLARE_WAITQUEUE(wq, current);
+
+ md = find_device(param);
+ if (!md)
+ /*
+ * Device not found - returns cleared exists flag.
+ */
+ goto out;
+
+ /*
+ * Setup the basic dm_ioctl structure.
+ */
+ __info(md, param);
+
+ /*
+ * Wait for a notification event
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ table = dm_get_table(md);
+ dm_table_add_wait_queue(table, &wq);
+ dm_table_put(table);
+ dm_put(md);
+
+ yield();
+ set_current_state(TASK_RUNNING);
+
+ out:
+ return results_to_user(user, param, NULL, 0);
+}
+
+/*
+ * Retrieves a list of devices used by a particular dm device.
+ */
+static int dep(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ int count, r;
+ struct mapped_device *md;
+ struct list_head *tmp;
+ size_t len = 0;
+ struct dm_target_deps *deps = NULL;
+ struct dm_table *table;
+
+ md = find_device(param);
+ if (!md)
+ goto out;
+ table = dm_get_table(md);
+
+ /*
+ * Setup the basic dm_ioctl structure.
+ */
+ __info(md, param);
+
+ /*
+ * Count the devices.
+ */
+ count = 0;
+ list_for_each(tmp, dm_table_get_devices(table))
+ count++;
+
+ /*
+	 * Allocate a kernel space version of the dm_target_deps
+ * struct.
+ */
+ if (array_too_big(sizeof(*deps), sizeof(*deps->dev), count)) {
+ dm_table_put(table);
+ dm_put(md);
+ return -ENOMEM;
+ }
+
+ len = sizeof(*deps) + (sizeof(*deps->dev) * count);
+ deps = kmalloc(len, GFP_KERNEL);
+ if (!deps) {
+ dm_table_put(table);
+ dm_put(md);
+ return -ENOMEM;
+ }
+
+ /*
+ * Fill in the devices.
+ */
+ deps->count = count;
+ count = 0;
+ list_for_each(tmp, dm_table_get_devices(table)) {
+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+ deps->dev[count++] = dd->bdev->bd_dev;
+ }
+ dm_table_put(table);
+ dm_put(md);
+
+ out:
+ r = results_to_user(user, param, deps, len);
+
+ kfree(deps);
+ return r;
+}
+
+static int remove(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ struct hash_cell *hc;
+
+ down_write(&_hash_lock);
+ hc = *param->uuid ? __get_uuid_cell(param->uuid) :
+ __get_name_cell(param->name);
+ if (!hc) {
+ DMWARN("device doesn't appear to be in the dev hash table.");
+ up_write(&_hash_lock);
+ return -EINVAL;
+ }
+
+ __hash_remove(hc);
+ up_write(&_hash_lock);
+ return 0;
+}
+
+static int remove_all(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ dm_hash_remove_all();
+ return 0;
+}
+
+static int suspend(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ int r;
+ struct mapped_device *md;
+
+ md = find_device(param);
+ if (!md)
+ return -ENXIO;
+
+ if (param->flags & DM_SUSPEND_FLAG)
+ r = dm_suspend(md);
+ else
+ r = dm_resume(md);
+
+ dm_put(md);
+ return r;
+}
+
+static int reload(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ int r;
+ kdev_t dev;
+ struct mapped_device *md;
+ struct dm_table *t;
+
+ r = dm_table_create(&t, get_mode(param));
+ if (r)
+ return r;
+
+ r = populate_table(t, param);
+ if (r) {
+ dm_table_put(t);
+ return r;
+ }
+
+ md = find_device(param);
+ if (!md) {
+ dm_table_put(t);
+ return -ENXIO;
+ }
+
+ r = dm_swap_table(md, t);
+ if (r) {
+ dm_put(md);
+ dm_table_put(t);
+ return r;
+ }
+
+ dev = dm_kdev(md);
+ set_device_ro(dev, (param->flags & DM_READONLY_FLAG));
+ dm_put(md);
+
+ r = info(param, user);
+ return r;
+}
+
+static int rename(struct dm_ioctl *param, struct dm_ioctl *user)
+{
+ int r;
+ char *new_name = (char *) param + param->data_start;
+
+ if (valid_str(new_name, (void *) param,
+ (void *) param + param->data_size)) {
+ DMWARN("Invalid new logical volume name supplied.");
+ return -EINVAL;
+ }
+
+ r = check_name(new_name);
+ if (r)
+ return r;
+
+ return dm_hash_rename(param->name, new_name);
+}
+
+
+/*-----------------------------------------------------------------
+ * Implementation of open/close/ioctl on the special char
+ * device.
+ *---------------------------------------------------------------*/
+static ioctl_fn lookup_ioctl(unsigned int cmd)
+{
+ static struct {
+ int cmd;
+ ioctl_fn fn;
+ } _ioctls[] = {
+ {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
+ {DM_REMOVE_ALL_CMD, remove_all},
+ {DM_DEV_CREATE_CMD, create},
+ {DM_DEV_REMOVE_CMD, remove},
+ {DM_DEV_RELOAD_CMD, reload},
+ {DM_DEV_RENAME_CMD, rename},
+ {DM_DEV_SUSPEND_CMD, suspend},
+ {DM_DEV_DEPS_CMD, dep},
+ {DM_DEV_STATUS_CMD, info},
+ {DM_TARGET_STATUS_CMD, get_status},
+ {DM_TARGET_WAIT_CMD, wait_device_event},
+ };
+
+ return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
+}
+
+/*
+ * As well as checking the version compatibility this always
+ * copies the kernel interface version out.
+ */
+static int check_version(int cmd, struct dm_ioctl *user)
+{
+ uint32_t version[3];
+ int r = 0;
+
+ if (copy_from_user(version, user->version, sizeof(version)))
+ return -EFAULT;
+
+ if ((DM_VERSION_MAJOR != version[0]) ||
+ (DM_VERSION_MINOR < version[1])) {
+ DMWARN("ioctl interface mismatch: "
+ "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
+ DM_VERSION_MAJOR, DM_VERSION_MINOR,
+ DM_VERSION_PATCHLEVEL,
+ version[0], version[1], version[2], cmd);
+ r = -EINVAL;
+ }
+
+ /*
+ * Fill in the kernel version.
+ */
+ version[0] = DM_VERSION_MAJOR;
+ version[1] = DM_VERSION_MINOR;
+ version[2] = DM_VERSION_PATCHLEVEL;
+ if (copy_to_user(user->version, version, sizeof(version)))
+ return -EFAULT;
+
+ return r;
+}
+
+static void free_params(struct dm_ioctl *param)
+{
+ vfree(param);
+}
+
+static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
+{
+ struct dm_ioctl tmp, *dmi;
+
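+	/* Copy the fixed-size header first to find out how much data follows. */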
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.data_size < sizeof(tmp))
+ return -EINVAL;
+
+ dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
+ if (!dmi)
+ return -ENOMEM;
+
+ if (copy_from_user(dmi, user, tmp.data_size)) {
+ vfree(dmi);
+ return -EFAULT;
+ }
+
+ *param = dmi;
+ return 0;
+}
+
+static int validate_params(uint cmd, struct dm_ioctl *param)
+{
+	/* Unless creating, either name or uuid must be given, but not both */
+ if (cmd != DM_DEV_CREATE_CMD) {
+ if ((!*param->uuid && !*param->name) ||
+ (*param->uuid && *param->name)) {
+ DMWARN("one of name or uuid must be supplied");
+ return -EINVAL;
+ }
+ }
+
+ /* Ensure strings are terminated */
+ param->name[DM_NAME_LEN - 1] = '\0';
+ param->uuid[DM_UUID_LEN - 1] = '\0';
+
+ return 0;
+}
+
+static int ctl_ioctl(struct inode *inode, struct file *file,
+ uint command, ulong u)
+{
+ int r = 0, cmd;
+ struct dm_ioctl *param;
+ struct dm_ioctl *user = (struct dm_ioctl *) u;
+ ioctl_fn fn = NULL;
+
+ /* only root can play with this */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (_IOC_TYPE(command) != DM_IOCTL)
+ return -ENOTTY;
+
+ cmd = _IOC_NR(command);
+
+ /*
+ * Check the interface version passed in. This also
+	 * writes out the kernel's interface version.
+ */
+ r = check_version(cmd, user);
+ if (r)
+ return r;
+
+ /*
+ * Nothing more to do for the version command.
+ */
+ if (cmd == DM_VERSION_CMD)
+ return 0;
+
+ fn = lookup_ioctl(cmd);
+ if (!fn) {
+ DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
+ return -ENOTTY;
+ }
+
+ /*
+ * Copy the parameters into kernel space.
+ */
+ r = copy_params(user, ¶m);
+ if (r)
+ return r;
+
+ r = validate_params(cmd, param);
+ if (r) {
+ free_params(param);
+ return r;
+ }
+
+ r = fn(param, user);
+ free_params(param);
+ return r;
+}
+
+static struct file_operations _ctl_fops = {
+ .ioctl = ctl_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static devfs_handle_t _ctl_handle;
+
+static struct miscdevice _dm_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = DM_NAME,
+ .fops = &_ctl_fops
+};
+
+/*
+ * Create misc character device and link to DM_DIR/control.
+ */
+int __init dm_interface_init(void)
+{
+ int r;
+ char rname[64];
+
+ r = dm_hash_init();
+ if (r)
+ return r;
+
+ r = misc_register(&_dm_misc);
+ if (r) {
+ DMERR("misc_register failed for control device");
+ dm_hash_exit();
+ return r;
+ }
+
+ r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
+ sizeof rname - 3);
+ if (r == -ENOSYS)
+ return 0; /* devfs not present */
+
+ if (r < 0) {
+ DMERR("devfs_generate_path failed for control device");
+ goto failed;
+ }
+
+ strncpy(rname + r, "../", 3);
+ r = devfs_mk_symlink(NULL, DM_DIR "/control",
+ DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
+ if (r) {
+ DMERR("devfs_mk_symlink failed for control device");
+ goto failed;
+ }
+ devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
+
+ DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
+ DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
+ DM_DRIVER_EMAIL);
+ return 0;
+
+ failed:
+ dm_hash_exit();
+ misc_deregister(&_dm_misc);
+ return r;
+}
+
+void dm_interface_exit(void)
+{
+ dm_hash_exit();
+
+ if (misc_deregister(&_dm_misc) < 0)
+ DMERR("misc_deregister failed for control device");
+}
--- diff/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-linear.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+
+/*
+ * Linear: maps a linear range of a device.
+ */
+struct linear_c {
+ struct dm_dev *dev;
+ sector_t start;
+};
+
+/*
+ * Construct a linear mapping: <dev_path> <offset>
+ */
+static int linear_ctr(struct dm_target *ti, int argc, char **argv)
+{
+ struct linear_c *lc;
+
+ if (argc != 2) {
+ ti->error = "dm-linear: Not enough arguments";
+ return -EINVAL;
+ }
+
+ lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+ if (lc == NULL) {
+ ti->error = "dm-linear: Cannot allocate linear context";
+ return -ENOMEM;
+ }
+
+ if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
+ ti->error = "dm-linear: Invalid device sector";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[0], lc->start, ti->len,
+ dm_table_get_mode(ti->table), &lc->dev)) {
+ ti->error = "dm-linear: Device lookup failed";
+ goto bad;
+ }
+
+ ti->private = lc;
+ return 0;
+
+ bad:
+ kfree(lc);
+ return -EINVAL;
+}
+
+static void linear_dtr(struct dm_target *ti)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+
+ dm_put_device(ti, lc->dev);
+ kfree(lc);
+}
+
+static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+
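+	/* Redirect the buffer to the backing device, offset by the target start. */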
+ bh->b_rdev = lc->dev->dev;
+ bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
+
+ return 1;
+}
+
+static int linear_status(struct dm_target *ti, status_type_t type,
+ char *result, int maxlen)
+{
+ struct linear_c *lc = (struct linear_c *) ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s " SECTOR_FORMAT,
+ kdevname(to_kdev_t(lc->dev->bdev->bd_dev)), lc->start);
+ break;
+ }
+ return 0;
+}
+
+static struct target_type linear_target = {
+ .name = "linear",
+ .module = THIS_MODULE,
+ .ctr = linear_ctr,
+ .dtr = linear_dtr,
+ .map = linear_map,
+ .status = linear_status,
+};
+
+int __init dm_linear_init(void)
+{
+ int r = dm_register_target(&linear_target);
+
+ if (r < 0)
+ DMERR("linear: register failed %d", r);
+
+ return r;
+}
+
+void dm_linear_exit(void)
+{
+ int r = dm_unregister_target(&linear_target);
+
+ if (r < 0)
+ DMERR("linear: unregister failed %d", r);
+}
--- diff/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-snapshot.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,1167 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#include "dm-snapshot.h"
+#include "kcopyd.h"
+
+/*
+ * FIXME: Remove this before release.
+ */
+#if 0
+#define DMDEBUG(x...) DMWARN(x)
+#else
+#define DMDEBUG(x...)
+#endif
+
+/*
+ * The percentage increment we will wake up users at
+ */
+#define WAKE_UP_PERCENT 5
+
+/*
+ * Hard sector size used all over the kernel
+ */
+#define SECTOR_SIZE 512
+
+/*
+ * kcopyd priority of snapshot operations
+ */
+#define SNAPSHOT_COPY_PRIORITY 2
+
+struct pending_exception {
+ struct exception e;
+
+ /*
+ * Origin buffers waiting for this to complete are held
+ * in a list (using b_reqnext).
+ */
+ struct buffer_head *origin_bhs;
+ struct buffer_head *snapshot_bhs;
+
+ /*
+ * Other pending_exceptions that are processing this
+ * chunk. When this list is empty, we know we can
+ * complete the origins.
+ */
+ struct list_head siblings;
+
+ /* Pointer back to snapshot context */
+ struct dm_snapshot *snap;
+
+ /*
+ * 1 indicates the exception has already been sent to
+ * kcopyd.
+ */
+ int started;
+};
+
+/*
+ * Caches and mempool used to allocate the exception structures.
+ */
+static kmem_cache_t *exception_cache;
+static kmem_cache_t *pending_cache;
+static mempool_t *pending_pool;
+
+/*
+ * One of these per registered origin, held in the _origins hash
+ */
+struct origin {
+ /* The origin device */
+ kdev_t dev;
+
+ struct list_head hash_list;
+
+ /* List of snapshots for this origin */
+ struct list_head snapshots;
+};
+
+/*
+ * Size of the hash table for origin volumes. If we make this
+ * the size of the minors list then it should be nearly perfect
+ */
+#define ORIGIN_HASH_SIZE 256
+#define ORIGIN_MASK 0xFF
+static struct list_head *_origins;
+static struct rw_semaphore _origins_lock;
+
+static int init_origin_hash(void)
+{
+ int i;
+
+ _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!_origins) {
+ DMERR("Device mapper: Snapshot: unable to allocate memory");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
+ INIT_LIST_HEAD(_origins + i);
+ init_rwsem(&_origins_lock);
+
+ return 0;
+}
+
+static void exit_origin_hash(void)
+{
+ kfree(_origins);
+}
+
+static inline unsigned int origin_hash(kdev_t dev)
+{
+ return MINOR(dev) & ORIGIN_MASK;
+}
+
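+/*
+ * Find the origin entry for a device, or NULL if there isn't one.
+ * The caller must hold _origins_lock.
+ */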
+static struct origin *__lookup_origin(kdev_t origin)
+{
+ struct list_head *slist;
+ struct list_head *ol;
+ struct origin *o;
+
+ ol = &_origins[origin_hash(origin)];
+ list_for_each(slist, ol) {
+ o = list_entry(slist, struct origin, hash_list);
+
+ if (o->dev == origin)
+ return o;
+ }
+
+ return NULL;
+}
+
+static void __insert_origin(struct origin *o)
+{
+ struct list_head *sl = &_origins[origin_hash(o->dev)];
+ list_add_tail(&o->hash_list, sl);
+}
+
+/*
+ * Make a note of the snapshot and its origin so we can look it
+ * up when the origin has a write on it.
+ */
+static int register_snapshot(struct dm_snapshot *snap)
+{
+ struct origin *o;
+ kdev_t dev = snap->origin->dev;
+
+ down_write(&_origins_lock);
+ o = __lookup_origin(dev);
+
+ if (!o) {
+ /* New origin */
+ o = kmalloc(sizeof(*o), GFP_KERNEL);
+ if (!o) {
+ up_write(&_origins_lock);
+ return -ENOMEM;
+ }
+
+ /* Initialise the struct */
+ INIT_LIST_HEAD(&o->snapshots);
+ o->dev = dev;
+
+ __insert_origin(o);
+ }
+
+ list_add_tail(&snap->list, &o->snapshots);
+
+ up_write(&_origins_lock);
+ return 0;
+}
+
+static void unregister_snapshot(struct dm_snapshot *s)
+{
+ struct origin *o;
+
+ down_write(&_origins_lock);
+ o = __lookup_origin(s->origin->dev);
+
+ list_del(&s->list);
+ if (list_empty(&o->snapshots)) {
+ list_del(&o->hash_list);
+ kfree(o);
+ }
+
+ up_write(&_origins_lock);
+}
+
+/*
+ * Implementation of the exception hash tables.
+ */
+static int init_exception_table(struct exception_table *et, uint32_t size)
+{
+ int i;
+
+ et->hash_mask = size - 1;
+ et->table = vcalloc(size, sizeof(struct list_head));
+ if (!et->table)
+ return -ENOMEM;
+
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(et->table + i);
+
+ return 0;
+}
+
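+/* Free any exceptions still left in the table, then the bucket array itself. */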
+static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+{
+ struct list_head *slot, *entry, *temp;
+ struct exception *ex;
+ int i, size;
+
+ size = et->hash_mask + 1;
+ for (i = 0; i < size; i++) {
+ slot = et->table + i;
+
+ list_for_each_safe(entry, temp, slot) {
+ ex = list_entry(entry, struct exception, hash_list);
+ kmem_cache_free(mem, ex);
+ }
+ }
+
+ vfree(et->table);
+}
+
+/*
+ * FIXME: check how this hash fn is performing.
+ */
+static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
+{
+ return chunk & et->hash_mask;
+}
+
+static void insert_exception(struct exception_table *eh, struct exception *e)
+{
+ struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
+ list_add(&e->hash_list, l);
+}
+
+static inline void remove_exception(struct exception *e)
+{
+ list_del(&e->hash_list);
+}
+
+/*
+ * Return the exception data for a chunk, or NULL if not
+ * remapped.
+ */
+static struct exception *lookup_exception(struct exception_table *et,
+ chunk_t chunk)
+{
+ struct list_head *slot, *el;
+ struct exception *e;
+
+ slot = &et->table[exception_hash(et, chunk)];
+ list_for_each(el, slot) {
+ e = list_entry(el, struct exception, hash_list);
+ if (e->old_chunk == chunk)
+ return e;
+ }
+
+ return NULL;
+}
+
+static inline struct exception *alloc_exception(void)
+{
+ struct exception *e;
+
+ e = kmem_cache_alloc(exception_cache, GFP_NOIO);
+ if (!e)
+ e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
+
+ return e;
+}
+
+static inline void free_exception(struct exception *e)
+{
+ kmem_cache_free(exception_cache, e);
+}
+
+static inline struct pending_exception *alloc_pending_exception(void)
+{
+ return mempool_alloc(pending_pool, GFP_NOIO);
+}
+
+static inline void free_pending_exception(struct pending_exception *pe)
+{
+ mempool_free(pe, pending_pool);
+}
+
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
+{
+ struct exception *e;
+
+ e = alloc_exception();
+ if (!e)
+ return -ENOMEM;
+
+ e->old_chunk = old;
+ e->new_chunk = new;
+ insert_exception(&s->complete, e);
+ return 0;
+}
+
+/*
+ * Hard coded magic.
+ */
+static int calc_max_buckets(void)
+{
+ unsigned long mem;
+
+ mem = num_physpages << PAGE_SHIFT;
+ mem /= 50;
+ mem /= sizeof(struct list_head);
+
+ return mem;
+}
+
+/*
+ * Rounds a number down to a power of 2.
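+ * e.g. 100 becomes 96 and then 64, since each iteration clears the
+ * lowest set bit.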
+ */
+static inline uint32_t round_down(uint32_t n)
+{
+ while (n & (n - 1))
+ n &= (n - 1);
+ return n;
+}
+
+/*
+ * Allocate room for a suitable hash table.
+ */
+static int init_hash_tables(struct dm_snapshot *s)
+{
+ sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+
+ /*
+ * Calculate based on the size of the original volume or
+ * the COW volume...
+ */
+ cow_dev_size = get_dev_size(s->cow->dev);
+ origin_dev_size = get_dev_size(s->origin->dev);
+ max_buckets = calc_max_buckets();
+
+ hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
+ hash_size = min(hash_size, max_buckets);
+
+ /* Round it down to a power of 2 */
+ hash_size = round_down(hash_size);
+ if (init_exception_table(&s->complete, hash_size))
+ return -ENOMEM;
+
+ /*
+ * Allocate hash table for in-flight exceptions
+ * Make this smaller than the real hash table
+ */
+ hash_size >>= 3;
+ if (!hash_size)
+ hash_size = 64;
+
+ if (init_exception_table(&s->pending, hash_size)) {
+ exit_exception_table(&s->complete, exception_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Round a number up to the nearest 'size' boundary. size must
+ * be a power of 2.
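+ * e.g. round_up(10, 8) and round_up(16, 8) both return 16.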
+ */
+static inline ulong round_up(ulong n, ulong size)
+{
+ size--;
+ return (n + size) & ~size;
+}
+
+/*
+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
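+ *
+ * e.g. (with hypothetical devices) argv = { "/dev/vg0/lvol1",
+ * "/dev/vg0/cow", "P", "16" } requests a persistent snapshot with
+ * 16-sector (8K) chunks.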
+ */
+static int snapshot_ctr(struct dm_target *ti, int argc, char **argv)
+{
+ struct dm_snapshot *s;
+ unsigned long chunk_size;
+ int r = -EINVAL;
+ char *persistent;
+ char *origin_path;
+ char *cow_path;
+ char *value;
+ int blocksize;
+
+ if (argc < 4) {
+ ti->error = "dm-snapshot: requires exactly 4 arguments";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ origin_path = argv[0];
+ cow_path = argv[1];
+ persistent = argv[2];
+
+ if ((*persistent & 0x5f) != 'P' && (*persistent & 0x5f) != 'N') {
+ ti->error = "Persistent flag is not P or N";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ chunk_size = simple_strtoul(argv[3], &value, 10);
+ if (chunk_size == 0 || value == NULL) {
+ ti->error = "Invalid chunk size";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (s == NULL) {
+ ti->error = "Cannot allocate snapshot context private structure";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
+ if (r) {
+ ti->error = "Cannot get origin device";
+ goto bad_free;
+ }
+
+ /* FIXME: get cow length */
+ r = dm_get_device(ti, cow_path, 0, 0,
+ FMODE_READ | FMODE_WRITE, &s->cow);
+ if (r) {
+ dm_put_device(ti, s->origin);
+ ti->error = "Cannot get COW device";
+ goto bad_free;
+ }
+
+ /*
+ * Chunk size must be multiple of page size. Silently
+ * round up if it's not.
+ */
+ chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
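+	/* e.g. with 4K pages and 512 byte sectors a chunk_size of 10 becomes 16 */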
+
+ /* Validate the chunk size against the device block size */
+ blocksize = get_hardsect_size(s->cow->dev);
+ if (chunk_size % (blocksize / SECTOR_SIZE)) {
+ ti->error = "Chunk size is not a multiple of device blocksize";
+ r = -EINVAL;
+ goto bad_putdev;
+ }
+
+ /* Check the sizes are small enough to fit in one kiovec */
+ if (chunk_size > KIO_MAX_SECTORS) {
+ ti->error = "Chunk size is too big";
+ r = -EINVAL;
+ goto bad_putdev;
+ }
+
+ /* Check chunk_size is a power of 2 */
+ if (chunk_size & (chunk_size - 1)) {
+ ti->error = "Chunk size is not a power of 2";
+ r = -EINVAL;
+ goto bad_putdev;
+ }
+
+ s->chunk_size = chunk_size;
+ s->chunk_mask = chunk_size - 1;
+ s->type = *persistent;
+ for (s->chunk_shift = 0; chunk_size;
+ s->chunk_shift++, chunk_size >>= 1)
+ ;
+ s->chunk_shift--;
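+	/* chunk_shift is now log2(chunk_size), e.g. 4 for a 16 sector chunk */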
+
+	s->valid = 1;
+	s->last_percent = 0;
+	s->table = ti->table;	/* needed by pending_complete() for events */
+	init_rwsem(&s->lock);
+
+ /* Allocate hash table for COW data */
+ if (init_hash_tables(s)) {
+ ti->error = "Unable to allocate hash table space";
+ r = -ENOMEM;
+ goto bad_putdev;
+ }
+
+ /*
+ * Check the persistent flag - done here because we need the iobuf
+ * to check the LV header
+ */
+ s->store.snap = s;
+
+ if ((*persistent & 0x5f) == 'P')
+ r = dm_create_persistent(&s->store, s->chunk_size);
+ else
+ r = dm_create_transient(&s->store, s, blocksize);
+
+ if (r) {
+ ti->error = "Couldn't create exception store";
+ r = -EINVAL;
+ goto bad_free1;
+ }
+
+ /* Flush IO to the origin device */
+#if LVM_VFS_ENHANCEMENT
+ fsync_dev_lockfs(s->origin->dev);
+#else
+ fsync_dev(s->origin->dev);
+#endif
+
+ /* Add snapshot to the list of snapshots for this origin */
+	if (register_snapshot(s)) {
+		r = -EINVAL;
+		ti->error = "Cannot register snapshot origin";
+#if LVM_VFS_ENHANCEMENT
+		unlockfs(s->origin->dev);
+#endif
+		goto bad_free2;
+	}
+#if LVM_VFS_ENHANCEMENT
+	unlockfs(s->origin->dev);
+#endif
+ kcopyd_inc_client_count();
+
+ ti->private = s;
+ return 0;
+
+ bad_free2:
+ s->store.destroy(&s->store);
+
+ bad_free1:
+ exit_exception_table(&s->pending, pending_cache);
+ exit_exception_table(&s->complete, exception_cache);
+
+ bad_putdev:
+ dm_put_device(ti, s->cow);
+ dm_put_device(ti, s->origin);
+
+ bad_free:
+ kfree(s);
+
+ bad:
+ return r;
+}
+
+static void snapshot_dtr(struct dm_target *ti)
+{
+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+
+ dm_table_event(ti->table);
+
+ unregister_snapshot(s);
+
+ exit_exception_table(&s->pending, pending_cache);
+ exit_exception_table(&s->complete, exception_cache);
+
+ /* Deallocate memory used */
+ s->store.destroy(&s->store);
+
+ dm_put_device(ti, s->origin);
+ dm_put_device(ti, s->cow);
+ kfree(s);
+
+ kcopyd_dec_client_count();
+}
+
+/*
+ * We hold lists of buffer_heads, using the b_reqnext field.
+ */
+static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
+{
+ bh->b_reqnext = *queue;
+ *queue = bh;
+}
+
+/*
+ * Flush a list of buffers.
+ */
+static void flush_buffers(struct buffer_head *bh)
+{
+ struct buffer_head *n;
+
+ DMDEBUG("begin flush");
+ while (bh) {
+ n = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+ DMDEBUG("flushing %p", bh);
+ generic_make_request(WRITE, bh);
+ bh = n;
+ }
+
+ run_task_queue(&tq_disk);
+}
+
+/*
+ * Error a list of buffers.
+ */
+static void error_buffers(struct buffer_head *bh)
+{
+ struct buffer_head *n;
+
+ while (bh) {
+ n = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+ buffer_IO_error(bh);
+ bh = n;
+ }
+}
+
+static void pending_complete(struct pending_exception *pe, int success)
+{
+ struct exception *e;
+ struct dm_snapshot *s = pe->snap;
+
+ if (success) {
+ e = alloc_exception();
+ if (!e) {
+			DMERR("Unable to allocate exception.");
+ down_write(&s->lock);
+ s->store.drop_snapshot(&s->store);
+ s->valid = 0;
+ up_write(&s->lock);
+ return;
+ }
+
+ /*
+ * Add a proper exception, and remove the
+ * inflight exception from the list.
+ */
+ down_write(&s->lock);
+
+ memcpy(e, &pe->e, sizeof(*e));
+ insert_exception(&s->complete, e);
+ remove_exception(&pe->e);
+
+ /* Submit any pending write BHs */
+ up_write(&s->lock);
+
+ flush_buffers(pe->snapshot_bhs);
+ DMDEBUG("Exception completed successfully.");
+
+ /* Notify any interested parties */
+ if (s->store.percent_full) {
+ int pc = s->store.percent_full(&s->store);
+
+ if (pc >= s->last_percent + WAKE_UP_PERCENT) {
+ dm_table_event(s->table);
+ s->last_percent = pc - pc % WAKE_UP_PERCENT;
+ }
+ }
+
+ } else {
+ /* Read/write error - snapshot is unusable */
+ DMERR("Error reading/writing snapshot");
+
+ down_write(&s->lock);
+ s->store.drop_snapshot(&s->store);
+ s->valid = 0;
+ remove_exception(&pe->e);
+ up_write(&s->lock);
+
+ error_buffers(pe->snapshot_bhs);
+
+ dm_table_event(s->table);
+ DMDEBUG("Exception failed.");
+ }
+
+ if (list_empty(&pe->siblings))
+ flush_buffers(pe->origin_bhs);
+ else
+ list_del(&pe->siblings);
+
+ free_pending_exception(pe);
+}
+
+static void commit_callback(void *context, int success)
+{
+ struct pending_exception *pe = (struct pending_exception *) context;
+ pending_complete(pe, success);
+}
+
+/*
+ * Called when the copy I/O has finished. kcopyd actually runs
+ * this code so don't block.
+ */
+static void copy_callback(int err, void *context)
+{
+ struct pending_exception *pe = (struct pending_exception *) context;
+ struct dm_snapshot *s = pe->snap;
+
+ if (err)
+ pending_complete(pe, 0);
+
+ else
+ /* Update the metadata if we are persistent */
+ s->store.commit_exception(&s->store, &pe->e, commit_callback,
+ pe);
+}
+
+/*
+ * Dispatches the copy operation to kcopyd.
+ */
+static inline void start_copy(struct pending_exception *pe)
+{
+ struct dm_snapshot *s = pe->snap;
+ struct kcopyd_region src, dest;
+
+ src.dev = s->origin->dev;
+ src.sector = chunk_to_sector(s, pe->e.old_chunk);
+ src.count = s->chunk_size;
+
+ dest.dev = s->cow->dev;
+ dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+ dest.count = s->chunk_size;
+
+ if (!pe->started) {
+ /* Hand over to kcopyd */
+ kcopyd_copy(&src, &dest, copy_callback, pe);
+ pe->started = 1;
+ }
+}
+
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ */
+static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
+ struct buffer_head *bh)
+{
+ struct exception *e;
+ struct pending_exception *pe;
+ chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
+
+ /*
+ * Is there a pending exception for this already ?
+ */
+ e = lookup_exception(&s->pending, chunk);
+ if (e) {
+ /* cast the exception to a pending exception */
+ pe = list_entry(e, struct pending_exception, e);
+
+ } else {
+ /* Create a new pending exception */
+ pe = alloc_pending_exception();
+ if (!pe) {
+ DMWARN("Couldn't allocate pending exception.");
+ return NULL;
+ }
+
+ pe->e.old_chunk = chunk;
+ pe->origin_bhs = pe->snapshot_bhs = NULL;
+ INIT_LIST_HEAD(&pe->siblings);
+ pe->snap = s;
+ pe->started = 0;
+
+ if (s->store.prepare_exception(&s->store, &pe->e)) {
+ free_pending_exception(pe);
+ s->valid = 0;
+ return NULL;
+ }
+
+ insert_exception(&s->pending, &pe->e);
+ }
+
+ return pe;
+}
+
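+/*
+ * Redirect a buffer into the COW device.  e.g. with 16-sector chunks
+ * an access to origin sector 35 (chunk 2, offset 3) that has been
+ * remapped to new_chunk 7 is sent to COW sector 7*16 + 3 = 115.
+ */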
+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
+ struct buffer_head *bh)
+{
+ bh->b_rdev = s->cow->dev;
+ bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
+ (bh->b_rsector & s->chunk_mask);
+}
+
+static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw)
+{
+ struct exception *e;
+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+ int r = 1;
+ chunk_t chunk;
+ struct pending_exception *pe;
+
+ chunk = sector_to_chunk(s, bh->b_rsector);
+
+ /* Full snapshots are not usable */
+ if (!s->valid)
+ return -1;
+
+ /*
+ * Write to snapshot - higher level takes care of RW/RO
+ * flags so we should only get this if we are
+ * writeable.
+ */
+ if (rw == WRITE) {
+
+ down_write(&s->lock);
+
+ /* If the block is already remapped - use that, else remap it */
+ e = lookup_exception(&s->complete, chunk);
+ if (e)
+ remap_exception(s, e, bh);
+
+		else {
+			pe = find_pending_exception(s, bh);
+
+			if (!pe) {
+				/* Invalidate the snapshot rather than
+				 * dereference a NULL pending exception */
+				s->store.drop_snapshot(&s->store);
+				s->valid = 0;
+				r = -1;
+			} else {
+				queue_buffer(&pe->snapshot_bhs, bh);
+				start_copy(pe);
+				r = 0;
+			}
+		}
+
+ up_write(&s->lock);
+
+ } else {
+ /*
+ * FIXME: this read path scares me because we
+ * always use the origin when we have a pending
+ * exception. However I can't think of a
+ * situation where this is wrong - ejt.
+ */
+
+ /* Do reads */
+ down_read(&s->lock);
+
+		/* See if it has been remapped */
+ e = lookup_exception(&s->complete, chunk);
+ if (e)
+ remap_exception(s, e, bh);
+ else
+ bh->b_rdev = s->origin->dev;
+
+ up_read(&s->lock);
+ }
+
+ return r;
+}
+
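+/*
+ * Splice two circular lists into a single ring.  e.g. merging the
+ * rings (l1, a) and (l2, b) gives l1 -> l2 -> b -> a -> l1.
+ */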
+static void list_merge(struct list_head *l1, struct list_head *l2)
+{
+ struct list_head *l1_n, *l2_p;
+
+ l1_n = l1->next;
+ l2_p = l2->prev;
+
+ l1->next = l2;
+ l2->prev = l1;
+
+ l2_p->next = l1_n;
+ l1_n->prev = l2_p;
+}
+
+static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
+{
+ int r = 1;
+ struct list_head *sl;
+ struct dm_snapshot *snap;
+ struct exception *e;
+ struct pending_exception *pe, *last = NULL;
+ chunk_t chunk;
+
+ /* Do all the snapshots on this origin */
+ list_for_each(sl, snapshots) {
+ snap = list_entry(sl, struct dm_snapshot, list);
+
+ /* Only deal with valid snapshots */
+ if (!snap->valid)
+ continue;
+
+ down_write(&snap->lock);
+
+ /*
+ * Remember, different snapshots can have
+ * different chunk sizes.
+ */
+ chunk = sector_to_chunk(snap, bh->b_rsector);
+
+ /*
+ * Check exception table to see if block
+ * is already remapped in this snapshot
+ * and trigger an exception if not.
+ */
+ e = lookup_exception(&snap->complete, chunk);
+ if (!e) {
+ pe = find_pending_exception(snap, bh);
+ if (!pe) {
+ snap->store.drop_snapshot(&snap->store);
+ snap->valid = 0;
+
+ } else {
+ if (last)
+ list_merge(&pe->siblings,
+ &last->siblings);
+
+ last = pe;
+ r = 0;
+ }
+ }
+
+ up_write(&snap->lock);
+ }
+
+ /*
+ * Now that we have a complete pe list we can start the copying.
+ */
+ if (last) {
+ pe = last;
+ do {
+ down_write(&pe->snap->lock);
+ queue_buffer(&pe->origin_bhs, bh);
+ start_copy(pe);
+ up_write(&pe->snap->lock);
+ pe = list_entry(pe->siblings.next,
+ struct pending_exception, siblings);
+
+ } while (pe != last);
+ }
+
+ return r;
+}
+
+static int snapshot_status(struct dm_target *ti, status_type_t type,
+ char *result, int maxlen)
+{
+ struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
+ char cow[16];
+ char org[16];
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ if (!snap->valid)
+ snprintf(result, maxlen, "Invalid");
+ else {
+ if (snap->store.percent_full)
+ snprintf(result, maxlen, "%d%%",
+					 snap->store.percent_full(&snap->store));
+ else
+ snprintf(result, maxlen, "Unknown");
+ }
+ break;
+
+ case STATUSTYPE_TABLE:
+ /*
+ * kdevname returns a static pointer so we need
+ * to make private copies if the output is to
+ * make sense.
+ */
+ strncpy(cow, kdevname(snap->cow->dev), sizeof(cow));
+ strncpy(org, kdevname(snap->origin->dev), sizeof(org));
+ snprintf(result, maxlen, "%s %s %c %ld", org, cow,
+ snap->type, snap->chunk_size);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Called on a write from the origin driver.
+ */
+int do_origin(struct dm_dev *origin, struct buffer_head *bh)
+{
+ struct origin *o;
+ int r;
+
+ down_read(&_origins_lock);
+ o = __lookup_origin(origin->dev);
+ if (!o)
+ BUG();
+
+ r = __origin_write(&o->snapshots, bh);
+ up_read(&_origins_lock);
+
+ return r;
+}
+
+/*
+ * Origin: maps a linear range of a device, with hooks for snapshotting.
+ */
+
+/*
+ * Construct an origin mapping: <dev_path>
+ * The context for an origin is merely a 'struct dm_dev *'
+ * pointing to the real device.
+ */
+static int origin_ctr(struct dm_target *ti, int argc, char **argv)
+{
+ int r;
+ struct dm_dev *dev;
+
+ if (argc != 1) {
+ ti->error = "dm-origin: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ r = dm_get_device(ti, argv[0], 0, ti->len,
+ dm_table_get_mode(ti->table), &dev);
+ if (r) {
+ ti->error = "Cannot get target device";
+ return r;
+ }
+
+ ti->private = dev;
+
+ return 0;
+}
+
+static void origin_dtr(struct dm_target *ti)
+{
+ struct dm_dev *dev = (struct dm_dev *) ti->private;
+ dm_put_device(ti, dev);
+}
+
+static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw)
+{
+ struct dm_dev *dev = (struct dm_dev *) ti->private;
+ bh->b_rdev = dev->dev;
+
+ /* Only tell snapshots if this is a write */
+ return (rw == WRITE) ? do_origin(dev, bh) : 1;
+}
+
+static int origin_status(struct dm_target *ti, status_type_t type, char *result,
+ int maxlen)
+{
+ struct dm_dev *dev = (struct dm_dev *) ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s", kdevname(dev->dev));
+ break;
+ }
+
+ return 0;
+}
+
+static struct target_type origin_target = {
+	.name = "snapshot-origin",
+	.module = THIS_MODULE,
+	.ctr = origin_ctr,
+	.dtr = origin_dtr,
+	.map = origin_map,
+	.status = origin_status,
+};
+
+static struct target_type snapshot_target = {
+	.name = "snapshot",
+	.module = THIS_MODULE,
+	.ctr = snapshot_ctr,
+	.dtr = snapshot_dtr,
+	.map = snapshot_map,
+	.status = snapshot_status,
+};
+
+int __init dm_snapshot_init(void)
+{
+ int r;
+
+ r = dm_register_target(&snapshot_target);
+ if (r) {
+ DMERR("snapshot target register failed %d", r);
+ return r;
+ }
+
+ r = dm_register_target(&origin_target);
+ if (r < 0) {
+		DMERR("origin target register failed %d", r);
+ goto bad1;
+ }
+
+ r = init_origin_hash();
+ if (r) {
+ DMERR("init_origin_hash failed.");
+ goto bad2;
+ }
+
+ exception_cache = kmem_cache_create("dm-snapshot-ex",
+ sizeof(struct exception),
+ __alignof__(struct exception),
+ 0, NULL, NULL);
+ if (!exception_cache) {
+ DMERR("Couldn't create exception cache.");
+ r = -ENOMEM;
+ goto bad3;
+ }
+
+ pending_cache =
+ kmem_cache_create("dm-snapshot-in",
+ sizeof(struct pending_exception),
+ __alignof__(struct pending_exception),
+ 0, NULL, NULL);
+ if (!pending_cache) {
+ DMERR("Couldn't create pending cache.");
+ r = -ENOMEM;
+ goto bad4;
+ }
+
+ pending_pool = mempool_create(128, mempool_alloc_slab,
+ mempool_free_slab, pending_cache);
+ if (!pending_pool) {
+ DMERR("Couldn't create pending pool.");
+ r = -ENOMEM;
+ goto bad5;
+ }
+
+ return 0;
+
+ bad5:
+ kmem_cache_destroy(pending_cache);
+ bad4:
+ kmem_cache_destroy(exception_cache);
+ bad3:
+ exit_origin_hash();
+ bad2:
+ dm_unregister_target(&origin_target);
+ bad1:
+ dm_unregister_target(&snapshot_target);
+ return r;
+}
+
+void dm_snapshot_exit(void)
+{
+ int r;
+
+ r = dm_unregister_target(&snapshot_target);
+ if (r)
+ DMERR("snapshot unregister failed %d", r);
+
+ r = dm_unregister_target(&origin_target);
+ if (r)
+ DMERR("origin unregister failed %d", r);
+
+ exit_origin_hash();
+ mempool_destroy(pending_pool);
+ kmem_cache_destroy(pending_cache);
+ kmem_cache_destroy(exception_cache);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- diff/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-snapshot.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,147 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SNAPSHOT_H
+#define DM_SNAPSHOT_H
+
+#include "dm.h"
+#include <linux/blkdev.h>
+
+struct exception_table {
+ uint32_t hash_mask;
+ struct list_head *table;
+};
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 64k - 256k.
+ */
+/* FIXME: can we get away with limiting these to a uint32_t ? */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ */
+struct exception {
+ struct list_head hash_list;
+
+ chunk_t old_chunk;
+ chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct exception_store {
+
+ /*
+ * Destroys this object when you've finished with it.
+ */
+ void (*destroy) (struct exception_store *store);
+
+ /*
+ * Find somewhere to store the next exception.
+ */
+ int (*prepare_exception) (struct exception_store *store,
+ struct exception *e);
+
+ /*
+ * Update the metadata with this exception.
+ */
+ void (*commit_exception) (struct exception_store *store,
+ struct exception *e,
+ void (*callback) (void *, int success),
+ void *callback_context);
+
+ /*
+ * The snapshot is invalid, note this in the metadata.
+ */
+ void (*drop_snapshot) (struct exception_store *store);
+
+ /*
+	 * Return how full the snapshot is, as a percentage.
+ */
+ int (*percent_full) (struct exception_store *store);
+
+ struct dm_snapshot *snap;
+ void *context;
+};
+
+struct dm_snapshot {
+ struct rw_semaphore lock;
+ struct dm_table *table;
+
+ struct dm_dev *origin;
+ struct dm_dev *cow;
+
+ /* List of snapshots per Origin */
+ struct list_head list;
+
+ /* Size of data blocks saved - must be a power of 2 */
+ chunk_t chunk_size;
+ chunk_t chunk_mask;
+ chunk_t chunk_shift;
+
+ /* You can't use a snapshot if this is 0 (e.g. if full) */
+ int valid;
+
+ /* Used for display of table */
+ char type;
+
+ /* The last percentage we notified */
+ int last_percent;
+
+ struct exception_table pending;
+ struct exception_table complete;
+
+ /* The on disk metadata handler */
+ struct exception_store store;
+};
+
+/*
+ * Used by the exception stores to load exceptions when
+ * initialising.
+ */
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
+
+/*
+ * Constructor and destructor for the default persistent
+ * store.
+ */
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
+
+int dm_create_transient(struct exception_store *store,
+ struct dm_snapshot *s, int blocksize);
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(kdev_t dev)
+{
+ int *sizes;
+
+ sizes = blk_size[MAJOR(dev)];
+ if (sizes)
+ return sizes[MINOR(dev)] << 1;
+
+ return 0;
+}
+
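+/*
+ * Sector <-> chunk conversion.  e.g. with a 16-sector chunk size
+ * (chunk_mask 15, chunk_shift 4), sector 35 lies in chunk 2, and
+ * chunk 2 starts at sector 32.
+ */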
+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
+{
+ return (sector & ~s->chunk_mask) >> s->chunk_shift;
+}
+
+static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
+{
+ return chunk << s->chunk_shift;
+}
+
+#endif
--- diff/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-stripe.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+
+struct stripe {
+ struct dm_dev *dev;
+ sector_t physical_start;
+};
+
+struct stripe_c {
+ uint32_t stripes;
+
+ /* The size of this target / num. stripes */
+ uint32_t stripe_width;
+
+ /* stripe chunk size */
+ uint32_t chunk_shift;
+ sector_t chunk_mask;
+
+ struct stripe stripe[0];
+};
+
+static inline struct stripe_c *alloc_context(int stripes)
+{
+ size_t len;
+
+ if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+ stripes))
+ return NULL;
+
+ len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
+
+ return kmalloc(len, GFP_KERNEL);
+}
+
+/*
+ * Parse a single <dev> <sector> pair
+ */
+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
+ int stripe, char **argv)
+{
+ sector_t start;
+
+ if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
+ return -EINVAL;
+
+ if (dm_get_device(ti, argv[0], start, sc->stripe_width,
+ dm_table_get_mode(ti->table),
+ &sc->stripe[stripe].dev))
+ return -ENXIO;
+
+ sc->stripe[stripe].physical_start = start;
+ return 0;
+}
+
+/*
+ * FIXME: Nasty function, only present because we can't link
+ * against __moddi3 and __divdi3.
+ *
+ * Returns non-zero iff a is an exact multiple of b, and sets *n = a / b.
+ */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+ sector_t acc, prev, i;
+
+ *n = 0;
+ while (a >= b) {
+ for (acc = b, prev = 0, i = 1;
+ acc <= a;
+ prev = acc, acc <<= 1, i <<= 1)
+ ;
+
+ a -= prev;
+ *n += i >> 1;
+ }
+
+ return a == 0;
+}
+
+/*
+ * Construct a striped mapping.
+ * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
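+ *
+ * e.g. (with hypothetical devices) "2 64 /dev/hda1 0 /dev/hdb1 0"
+ * describes two stripes with 64-sector (32K) chunks.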
+ */
+static int stripe_ctr(struct dm_target *ti, int argc, char **argv)
+{
+ struct stripe_c *sc;
+ sector_t width;
+ uint32_t stripes;
+ uint32_t chunk_size;
+ char *end;
+ int r, i;
+
+ if (argc < 2) {
+ ti->error = "dm-stripe: Not enough arguments";
+ return -EINVAL;
+ }
+
+ stripes = simple_strtoul(argv[0], &end, 10);
+ if (*end) {
+ ti->error = "dm-stripe: Invalid stripe count";
+ return -EINVAL;
+ }
+
+ chunk_size = simple_strtoul(argv[1], &end, 10);
+ if (*end) {
+ ti->error = "dm-stripe: Invalid chunk_size";
+ return -EINVAL;
+ }
+
+ if (!multiple(ti->len, stripes, &width)) {
+		ti->error = "dm-stripe: Target length not divisible by "
+ "number of stripes";
+ return -EINVAL;
+ }
+
+ sc = alloc_context(stripes);
+ if (!sc) {
+ ti->error = "dm-stripe: Memory allocation for striped context "
+ "failed";
+ return -ENOMEM;
+ }
+
+ sc->stripes = stripes;
+ sc->stripe_width = width;
+
+ /*
+ * chunk_size is a power of two
+ */
+ if (!chunk_size || (chunk_size & (chunk_size - 1))) {
+ ti->error = "dm-stripe: Invalid chunk size";
+ kfree(sc);
+ return -EINVAL;
+ }
+
+ sc->chunk_mask = ((sector_t) chunk_size) - 1;
+ for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
+ chunk_size >>= 1;
+ sc->chunk_shift--;
+
+ /*
+ * Get the stripe destinations.
+ */
+ for (i = 0; i < stripes; i++) {
+ if (argc < 2) {
+ ti->error = "dm-stripe: Not enough destinations "
+ "specified";
+ kfree(sc);
+ return -EINVAL;
+ }
+
+ argv += 2;
+
+ r = get_stripe(ti, sc, i, argv);
+ if (r < 0) {
+ ti->error = "dm-stripe: Couldn't parse stripe "
+ "destination";
+ while (i--)
+ dm_put_device(ti, sc->stripe[i].dev);
+ kfree(sc);
+ return r;
+ }
+ }
+
+ ti->private = sc;
+ return 0;
+}
+
+static void stripe_dtr(struct dm_target *ti)
+{
+ unsigned int i;
+ struct stripe_c *sc = (struct stripe_c *) ti->private;
+
+ for (i = 0; i < sc->stripes; i++)
+ dm_put_device(ti, sc->stripe[i].dev);
+
+ kfree(sc);
+}
+
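+/*
+ * Map a buffer onto one of the stripes.  e.g. with two stripes and
+ * 64-sector chunks, an offset of 200 sectors falls in chunk 3 and so
+ * maps to stripe 1, chunk 1, i.e. that stripe's physical_start + 72.
+ */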
+static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw)
+{
+ struct stripe_c *sc = (struct stripe_c *) ti->private;
+
+ sector_t offset = bh->b_rsector - ti->begin;
+ uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
+ uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
+ chunk = chunk / sc->stripes;
+
+ bh->b_rdev = sc->stripe[stripe].dev->dev;
+ bh->b_rsector = sc->stripe[stripe].physical_start +
+ (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
+ return 1;
+}
+
+static int stripe_status(struct dm_target *ti,
+ status_type_t type, char *result, int maxlen)
+{
+ struct stripe_c *sc = (struct stripe_c *) ti->private;
+ int offset;
+ int i;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
+ sc->stripes, sc->chunk_mask + 1);
+ for (i = 0; i < sc->stripes; i++) {
+ offset +=
+ snprintf(result + offset, maxlen - offset,
+ " %s " SECTOR_FORMAT,
+ kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
+ sc->stripe[i].physical_start);
+ }
+ break;
+ }
+ return 0;
+}
+
+static struct target_type stripe_target = {
+ .name = "striped",
+ .module = THIS_MODULE,
+ .ctr = stripe_ctr,
+ .dtr = stripe_dtr,
+ .map = stripe_map,
+ .status = stripe_status,
+};
+
+int __init dm_stripe_init(void)
+{
+ int r;
+
+ r = dm_register_target(&stripe_target);
+ if (r < 0)
+ DMWARN("striped target registration failed");
+
+ return r;
+}
+
+void dm_stripe_exit(void)
+{
+ if (dm_unregister_target(&stripe_target))
+ DMWARN("striped target unregistration failed");
+
+ return;
+}
--- diff/drivers/md/dm-table.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-table.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,668 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <asm/atomic.h>
+
+#define MAX_DEPTH 16
+#define NODE_SIZE L1_CACHE_BYTES
+#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
+#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
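+/*
+ * e.g. with a 64-byte L1_CACHE_BYTES and an 8-byte sector_t, each
+ * node holds 8 keys and points to 9 children.
+ */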
+
+struct dm_table {
+ atomic_t holders;
+
+ /* btree table */
+ int depth;
+ int counts[MAX_DEPTH]; /* in nodes */
+ sector_t *index[MAX_DEPTH];
+
+ int num_targets;
+ int num_allocated;
+ sector_t *highs;
+ struct dm_target *targets;
+
+ /*
+ * Indicates the rw permissions for the new logical
+ * device. This should be a combination of FMODE_READ
+ * and FMODE_WRITE.
+ */
+ int mode;
+
+ /* a list of devices used by this table */
+ struct list_head devices;
+
+ /*
+ * A waitqueue for processes waiting for something
+ * interesting to happen to this table.
+ */
+ wait_queue_head_t eventq;
+};
+
+/*
+ * Ceiling(n / size)
+ */
+static inline unsigned long div_up(unsigned long n, unsigned long size)
+{
+ return dm_round_up(n, size) / size;
+}
+
+/*
+ * Similar to ceiling(log_size(n))
+ */
+static unsigned int int_log(unsigned long n, unsigned long base)
+{
+ int result = 0;
+
+ while (n > 1) {
+ n = div_up(n, base);
+ result++;
+ }
+
+ return result;
+}
+
+/*
+ * Calculate the index of the child node of the n'th node k'th key.
+ */
+static inline int get_child(int n, int k)
+{
+ return (n * CHILDREN_PER_NODE) + k;
+}
+
+/*
+ * Return the n'th node of level l from table t.
+ */
+static inline sector_t *get_node(struct dm_table *t, int l, int n)
+{
+ return t->index[l] + (n * KEYS_PER_NODE);
+}
+
+/*
+ * Return the highest key that you could lookup from the n'th
+ * node on level l of the btree.
+ */
+static sector_t high(struct dm_table *t, int l, int n)
+{
+ for (; l < t->depth - 1; l++)
+ n = get_child(n, CHILDREN_PER_NODE - 1);
+
+ if (n >= t->counts[l])
+ return (sector_t) - 1;
+
+ return get_node(t, l, n)[KEYS_PER_NODE - 1];
+}
+
+/*
+ * Fills in a level of the btree based on the highs of the level
+ * below it.
+ */
+static int setup_btree_index(int l, struct dm_table *t)
+{
+ int n, k;
+ sector_t *node;
+
+ for (n = 0; n < t->counts[l]; n++) {
+ node = get_node(t, l, n);
+
+ for (k = 0; k < KEYS_PER_NODE; k++)
+ node[k] = high(t, l + 1, get_child(n, k));
+ }
+
+ return 0;
+}
+
+/*
+ * highs, and targets are managed as dynamic arrays during a
+ * table load.
+ */
+static int alloc_targets(struct dm_table *t, int num)
+{
+ sector_t *n_highs;
+ struct dm_target *n_targets;
+ int n = t->num_targets;
+
+ /*
+ * Allocate both the target array and offset array at once.
+ */
+ n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
+ sizeof(sector_t),
+ num);
+ if (!n_highs)
+ return -ENOMEM;
+
+ n_targets = (struct dm_target *) (n_highs + num);
+
+ if (n) {
+ memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
+ memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
+ }
+
+ memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
+ vfree(t->highs);
+
+ t->num_allocated = num;
+ t->highs = n_highs;
+ t->targets = n_targets;
+
+ return 0;
+}
+
+int dm_table_create(struct dm_table **result, int mode)
+{
+ struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
+
+ if (!t)
+ return -ENOMEM;
+
+ memset(t, 0, sizeof(*t));
+ INIT_LIST_HEAD(&t->devices);
+ atomic_set(&t->holders, 1);
+
+ /* allocate a single nodes worth of targets to begin with */
+ if (alloc_targets(t, KEYS_PER_NODE)) {
+ kfree(t);
+ t = NULL;
+ return -ENOMEM;
+ }
+
+ init_waitqueue_head(&t->eventq);
+ t->mode = mode;
+ *result = t;
+ return 0;
+}
+
+static void free_devices(struct list_head *devices)
+{
+ struct list_head *tmp, *next;
+
+ for (tmp = devices->next; tmp != devices; tmp = next) {
+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+ next = tmp->next;
+ kfree(dd);
+ }
+}
+
+void table_destroy(struct dm_table *t)
+{
+ int i;
+
+ DMWARN("destroying table");
+
+ /* destroying the table counts as an event */
+ dm_table_event(t);
+
+ /* free the indexes (see dm_table_complete) */
+ if (t->depth >= 2)
+ vfree(t->index[t->depth - 2]);
+
+ /* free the targets */
+ for (i = 0; i < t->num_targets; i++) {
+ struct dm_target *tgt = &t->targets[i];
+
+ dm_put_target_type(t->targets[i].type);
+
+ if (tgt->type->dtr)
+ tgt->type->dtr(tgt);
+ }
+
+ vfree(t->highs);
+
+ /* free the device list */
+ if (t->devices.next != &t->devices) {
+ DMWARN("devices still present during destroy: "
+ "dm_table_remove_device calls missing");
+
+ free_devices(&t->devices);
+ }
+
+ kfree(t);
+}
+
+void dm_table_get(struct dm_table *t)
+{
+ atomic_inc(&t->holders);
+}
+
+void dm_table_put(struct dm_table *t)
+{
+ if (atomic_dec_and_test(&t->holders))
+ table_destroy(t);
+}
+
+/*
+ * Checks to see if we need to extend highs or targets.
+ */
+static inline int check_space(struct dm_table *t)
+{
+ if (t->num_targets >= t->num_allocated)
+ return alloc_targets(t, t->num_allocated * 2);
+
+ return 0;
+}
+
+/*
+ * Convert a device path to a dev_t.
+ */
+static int lookup_device(const char *path, kdev_t *dev)
+{
+ int r;
+ struct nameidata nd;
+ struct inode *inode;
+
+ if (!path_init(path, LOOKUP_FOLLOW, &nd))
+ return 0;
+
+ if ((r = path_walk(path, &nd)))
+ goto out;
+
+ inode = nd.dentry->d_inode;
+ if (!inode) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!S_ISBLK(inode->i_mode)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ *dev = inode->i_rdev;
+
+ out:
+ path_release(&nd);
+ return r;
+}
+
+/*
+ * See if we've already got a device in the list.
+ */
+static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
+{
+ struct list_head *tmp;
+
+ list_for_each(tmp, l) {
+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+ if (kdev_same(dd->dev, dev))
+ return dd;
+ }
+
+ return NULL;
+}
+
+/*
+ * Open a device so we can use it as a map destination.
+ */
+static int open_dev(struct dm_dev *dd)
+{
+ if (dd->bdev)
+ BUG();
+
+ dd->bdev = bdget(kdev_t_to_nr(dd->dev));
+ if (!dd->bdev)
+ return -ENOMEM;
+
+ return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
+}
+
+/*
+ * Close a device that we've been using.
+ */
+static void close_dev(struct dm_dev *dd)
+{
+ if (!dd->bdev)
+ return;
+
+ blkdev_put(dd->bdev, BDEV_RAW);
+ dd->bdev = NULL;
+}
+
+/*
+ * If possible (i.e. blk_size[major] is set), this checks that an
+ * area of a destination device is valid.
+ */
+static int check_device_area(kdev_t dev, sector_t start, sector_t len)
+{
+ int *sizes;
+ sector_t dev_size;
+
+ if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
+ /* we don't know the device details,
+ * so give the benefit of the doubt */
+ return 1;
+
+ /* convert to 512-byte sectors */
+ dev_size <<= 1;
+
+ return ((start < dev_size) && (len <= (dev_size - start)));
+}
+
+/*
+ * This upgrades the mode on an already open dm_dev, being
+ * careful to leave things as they were if we fail to reopen
+ * the device.
+ */
+static int upgrade_mode(struct dm_dev *dd, int new_mode)
+{
+ int r;
+ struct dm_dev dd_copy;
+
+ memcpy(&dd_copy, dd, sizeof(dd_copy));
+
+ dd->mode |= new_mode;
+ dd->bdev = NULL;
+ r = open_dev(dd);
+ if (!r)
+ close_dev(&dd_copy);
+ else
+ memcpy(dd, &dd_copy, sizeof(dd_copy));
+
+ return r;
+}
+
+/*
+ * Add a device to the list, or just increment the usage count if
+ * it's already present.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+ sector_t len, int mode, struct dm_dev **result)
+{
+ int r;
+ kdev_t dev;
+ struct dm_dev *dd;
+ int major, minor;
+ struct dm_table *t = ti->table;
+
+ if (!t)
+ BUG();
+
+ if (sscanf(path, "%x:%x", &major, &minor) == 2) {
+ /* Extract the major/minor numbers */
+ dev = mk_kdev(major, minor);
+ } else {
+ /* convert the path to a device */
+ if ((r = lookup_device(path, &dev)))
+ return r;
+ }
+
+ dd = find_device(&t->devices, dev);
+ if (!dd) {
+ dd = kmalloc(sizeof(*dd), GFP_KERNEL);
+ if (!dd)
+ return -ENOMEM;
+
+ dd->dev = dev;
+ dd->mode = mode;
+ dd->bdev = NULL;
+
+ if ((r = open_dev(dd))) {
+ kfree(dd);
+ return r;
+ }
+
+ atomic_set(&dd->count, 0);
+ list_add(&dd->list, &t->devices);
+
+ } else if (dd->mode != (mode | dd->mode)) {
+ r = upgrade_mode(dd, mode);
+ if (r)
+ return r;
+ }
+ atomic_inc(&dd->count);
+
+ if (!check_device_area(dd->dev, start, len)) {
+ DMWARN("device %s too small for target", path);
+ dm_put_device(ti, dd);
+ return -EINVAL;
+ }
+
+ *result = dd;
+
+ return 0;
+}
+
+/*
+ * Decrement a device's use count and remove it if necessary.
+ */
+void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
+{
+ if (atomic_dec_and_test(&dd->count)) {
+ close_dev(dd);
+ list_del(&dd->list);
+ kfree(dd);
+ }
+}
+
+/*
+ * Checks to see if the target joins onto the end of the table.
+ */
+static int adjoin(struct dm_table *table, struct dm_target *ti)
+{
+ struct dm_target *prev;
+
+ if (!table->num_targets)
+ return !ti->begin;
+
+ prev = &table->targets[table->num_targets - 1];
+ return (ti->begin == (prev->begin + prev->len));
+}
+
+/*
+ * Destructively splits up the argument list to pass to ctr.
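+ *
+ * A backslash quotes the following character, so e.g. the
+ * (hypothetical) input "foo bar\ baz" yields the two arguments
+ * "foo" and "bar baz".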
+ */
+static int split_args(int max, int *argc, char **argv, char *input)
+{
+ char *start, *end = input, *out;
+ *argc = 0;
+
+ while (1) {
+ start = end;
+
+ /* Skip whitespace */
+ while (*start && isspace(*start))
+ start++;
+
+ if (!*start)
+ break; /* success, we hit the end */
+
+ /* 'out' is used to remove any back-quotes */
+ end = out = start;
+ while (*end) {
+ /* Everything apart from '\0' can be quoted */
+ if (*end == '\\' && *(end + 1)) {
+ *out++ = *(end + 1);
+ end += 2;
+ continue;
+ }
+
+ if (isspace(*end))
+ break; /* end of token */
+
+ *out++ = *end++;
+ }
+
+ /* have we already filled the array ? */
+ if ((*argc + 1) > max)
+ return -EINVAL;
+
+ /* we know this is whitespace */
+ if (*end)
+ end++;
+
+ /* terminate the string and put it in the array */
+ *out = '\0';
+ argv[*argc] = start;
+ (*argc)++;
+ }
+
+ return 0;
+}
+
+int dm_table_add_target(struct dm_table *t, const char *type,
+ sector_t start, sector_t len, char *params)
+{
+ int r, argc;
+ char *argv[32];
+ struct target_type *tt;
+ struct dm_target *tgt;
+
+ if ((r = check_space(t)))
+ return r;
+
+ tgt = t->targets + t->num_targets;
+ memset(tgt, 0, sizeof(*tgt));
+
+ tt = dm_get_target_type(type);
+ if (!tt) {
+ tgt->error = "unknown target type";
+ return -EINVAL;
+ }
+
+ tgt->table = t;
+ tgt->type = tt;
+ tgt->begin = start;
+ tgt->len = len;
+ tgt->error = "Unknown error";
+
+ /*
+ * Does this target adjoin the previous one ?
+ */
+ if (!adjoin(t, tgt)) {
+ DMERR("Gap in table");
+ dm_put_target_type(tt);
+ return -EINVAL;
+ }
+
+ r = split_args(ARRAY_SIZE(argv), &argc, argv, params);
+ if (r) {
+ tgt->error = "couldn't split parameters";
+ dm_put_target_type(tt);
+ return r;
+ }
+
+ r = tt->ctr(tgt, argc, argv);
+ if (r) {
+ dm_put_target_type(tt);
+ return r;
+ }
+
+ t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
+ return 0;
+}
+
+static int setup_indexes(struct dm_table *t)
+{
+ int i, total = 0;
+ sector_t *indexes;
+
+ /* allocate the space for *all* the indexes */
+ for (i = t->depth - 2; i >= 0; i--) {
+ t->counts[i] = div_up(t->counts[i + 1], CHILDREN_PER_NODE);
+ total += t->counts[i];
+ }
+
+ indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
+ if (!indexes)
+ return -ENOMEM;
+
+ /* set up internal nodes, bottom-up */
+ for (i = t->depth - 2, total = 0; i >= 0; i--) {
+ t->index[i] = indexes;
+ indexes += (KEYS_PER_NODE * t->counts[i]);
+ setup_btree_index(i, t);
+ }
+
+ return 0;
+}
+
+/*
+ * Builds the btree to index the map.
+ */
+int dm_table_complete(struct dm_table *t)
+{
+ int leaf_nodes, r = 0;
+
+ /* how many indexes will the btree have ? */
+ leaf_nodes = div_up(t->num_targets, KEYS_PER_NODE);
+ t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
+
+ /* leaf layer has already been set up */
+ t->counts[t->depth - 1] = leaf_nodes;
+ t->index[t->depth - 1] = t->highs;
+
+ if (t->depth >= 2)
+ r = setup_indexes(t);
+
+ return r;
+}
+
+void dm_table_event(struct dm_table *t)
+{
+ wake_up_interruptible(&t->eventq);
+}
+
+sector_t dm_table_get_size(struct dm_table *t)
+{
+ return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
+}
+
+struct dm_target *dm_table_get_target(struct dm_table *t, int index)
+{
+	if (index >= t->num_targets)
+ return NULL;
+
+ return t->targets + index;
+}
+
+/*
+ * Search the btree for the correct target.
+ */
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
+{
+ int l, n = 0, k = 0;
+ sector_t *node;
+
+ for (l = 0; l < t->depth; l++) {
+ n = get_child(n, k);
+ node = get_node(t, l, n);
+
+ for (k = 0; k < KEYS_PER_NODE; k++)
+ if (node[k] >= sector)
+ break;
+ }
+
+ return &t->targets[(KEYS_PER_NODE * n) + k];
+}
+
+unsigned int dm_table_get_num_targets(struct dm_table *t)
+{
+ return t->num_targets;
+}
+
+struct list_head *dm_table_get_devices(struct dm_table *t)
+{
+ return &t->devices;
+}
+
+int dm_table_get_mode(struct dm_table *t)
+{
+ return t->mode;
+}
+
+void dm_table_add_wait_queue(struct dm_table *t, wait_queue_t *wq)
+{
+ add_wait_queue(&t->eventq, wq);
+}
+
+EXPORT_SYMBOL(dm_get_device);
+EXPORT_SYMBOL(dm_put_device);
+EXPORT_SYMBOL(dm_table_event);
--- diff/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm-target.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+
+struct tt_internal {
+ struct target_type tt;
+
+ struct list_head list;
+ long use;
+};
+
+static LIST_HEAD(_targets);
+static rwlock_t _lock = RW_LOCK_UNLOCKED;
+
+#define DM_MOD_NAME_SIZE 32
+
+static inline struct tt_internal *__find_target_type(const char *name)
+{
+ struct list_head *tih;
+ struct tt_internal *ti;
+
+ list_for_each(tih, &_targets) {
+ ti = list_entry(tih, struct tt_internal, list);
+
+ if (!strcmp(name, ti->tt.name))
+ return ti;
+ }
+
+ return NULL;
+}
+
+static struct tt_internal *get_target_type(const char *name)
+{
+ struct tt_internal *ti;
+
+ read_lock(&_lock);
+ ti = __find_target_type(name);
+
+ if (ti) {
+ if (ti->use == 0 && ti->tt.module)
+ __MOD_INC_USE_COUNT(ti->tt.module);
+ ti->use++;
+ }
+ read_unlock(&_lock);
+
+ return ti;
+}
+
+static void load_module(const char *name)
+{
+ char module_name[DM_MOD_NAME_SIZE] = "dm-";
+
+ /* Length check for strcat() below */
+ if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
+ return;
+
+ strcat(module_name, name);
+ request_module(module_name);
+
+ return;
+}
+
+struct target_type *dm_get_target_type(const char *name)
+{
+ struct tt_internal *ti = get_target_type(name);
+
+ if (!ti) {
+ load_module(name);
+ ti = get_target_type(name);
+ }
+
+ return ti ? &ti->tt : NULL;
+}
+
+void dm_put_target_type(struct target_type *t)
+{
+ struct tt_internal *ti = (struct tt_internal *) t;
+
+ read_lock(&_lock);
+ if (--ti->use == 0 && ti->tt.module)
+ __MOD_DEC_USE_COUNT(ti->tt.module);
+
+ if (ti->use < 0)
+ BUG();
+ read_unlock(&_lock);
+
+ return;
+}
+
+static struct tt_internal *alloc_target(struct target_type *t)
+{
+ struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+ if (ti) {
+ memset(ti, 0, sizeof(*ti));
+ ti->tt = *t;
+ }
+
+ return ti;
+}
+
+int dm_register_target(struct target_type *t)
+{
+ int rv = 0;
+ struct tt_internal *ti = alloc_target(t);
+
+ if (!ti)
+ return -ENOMEM;
+
+ write_lock(&_lock);
+ if (__find_target_type(t->name))
+ rv = -EEXIST;
+ else
+ list_add(&ti->list, &_targets);
+
+ write_unlock(&_lock);
+ return rv;
+}
+
+int dm_unregister_target(struct target_type *t)
+{
+ struct tt_internal *ti;
+
+ write_lock(&_lock);
+ if (!(ti = __find_target_type(t->name))) {
+ write_unlock(&_lock);
+ return -EINVAL;
+ }
+
+ if (ti->use) {
+ write_unlock(&_lock);
+ return -ETXTBSY;
+ }
+
+ list_del(&ti->list);
+ kfree(ti);
+
+ write_unlock(&_lock);
+ return 0;
+}
+
+/*
+ * io-err: always fails an io, useful for bringing
+ * up LVs that have holes in them.
+ */
+static int io_err_ctr(struct dm_target *ti, int argc, char **args)
+{
+ return 0;
+}
+
+static void io_err_dtr(struct dm_target *ti)
+{
+ /* empty */
+ return;
+}
+
+static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw)
+{
+ buffer_IO_error(bh);
+ return 0;
+}
+
+static struct target_type error_target = {
+ .name = "error",
+ .ctr = io_err_ctr,
+ .dtr = io_err_dtr,
+ .map = io_err_map,
+};
+
+int dm_target_init(void)
+{
+ return dm_register_target(&error_target);
+}
+
+void dm_target_exit(void)
+{
+ if (dm_unregister_target(&error_target))
+ DMWARN("error target unregistration failed");
+}
+
+EXPORT_SYMBOL(dm_register_target);
+EXPORT_SYMBOL(dm_unregister_target);
--- diff/drivers/md/dm.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,862 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/blk.h>
+#include <linux/blkpg.h>
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <linux/kdev_t.h>
+#include <linux/lvm.h>
+
+#include <asm/uaccess.h>
+
+static const char *_name = DM_NAME;
+#define MAX_DEVICES (1 << MINORBITS)
+#define SECTOR_SHIFT 9
+#define DEFAULT_READ_AHEAD 64
+
+static int major = 0;
+static int _major = 0;
+
+struct dm_io {
+ struct mapped_device *md;
+
+ void (*end_io) (struct buffer_head * bh, int uptodate);
+ void *context;
+};
+
+struct deferred_io {
+ int rw;
+ struct buffer_head *bh;
+ struct deferred_io *next;
+};
+
+/*
+ * Bits for the md->flags field.
+ */
+#define DMF_BLOCK_IO 0
+#define DMF_SUSPENDED 1
+
+struct mapped_device {
+ struct rw_semaphore lock;
+ atomic_t holders;
+
+ kdev_t dev;
+ unsigned long flags;
+
+ /*
+ * A list of ios that arrived while we were suspended.
+ */
+ atomic_t pending;
+ wait_queue_head_t wait;
+ struct deferred_io *deferred;
+
+ /*
+ * The current mapping.
+ */
+ struct dm_table *map;
+};
+
+#define MIN_IOS 256
+static kmem_cache_t *_io_cache;
+static mempool_t *_io_pool;
+
+/* block device arrays */
+static int _block_size[MAX_DEVICES];
+static int _blksize_size[MAX_DEVICES];
+static int _hardsect_size[MAX_DEVICES];
+
+static struct mapped_device *get_kdev(kdev_t dev);
+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
+
+
+static __init int local_init(void)
+{
+ int r;
+
+ /* allocate a slab for the dm_ios */
+ _io_cache = kmem_cache_create("dm io",
+ sizeof(struct dm_io), 0, 0, NULL, NULL);
+
+ if (!_io_cache)
+ return -ENOMEM;
+
+ _io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
+ mempool_free_slab, _io_cache);
+ if (!_io_pool) {
+ kmem_cache_destroy(_io_cache);
+ return -ENOMEM;
+ }
+
+ _major = major;
+ r = register_blkdev(_major, _name, &dm_blk_dops);
+ if (r < 0) {
+ DMERR("register_blkdev failed");
+ mempool_destroy(_io_pool);
+ kmem_cache_destroy(_io_cache);
+ return r;
+ }
+
+ if (!_major)
+ _major = r;
+
+ /* set up the arrays */
+ read_ahead[_major] = DEFAULT_READ_AHEAD;
+ blk_size[_major] = _block_size;
+ blksize_size[_major] = _blksize_size;
+ hardsect_size[_major] = _hardsect_size;
+
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(_major), dm_request);
+
+ return 0;
+}
+
+static void local_exit(void)
+{
+ mempool_destroy(_io_pool);
+ kmem_cache_destroy(_io_cache);
+
+ if (unregister_blkdev(_major, _name) < 0)
+		DMERR("unregister_blkdev failed");
+
+ read_ahead[_major] = 0;
+ blk_size[_major] = NULL;
+ blksize_size[_major] = NULL;
+ hardsect_size[_major] = NULL;
+ _major = 0;
+
+ DMINFO("cleaned up");
+}
+
+/*
+ * We have a lot of init/exit functions, so it seems easier to
+ * store them in an array. The disposable macro 'xx'
+ * expands a prefix into a pair of function names.
+ */
+static struct {
+ int (*init) (void);
+ void (*exit) (void);
+
+} _inits[] = {
+#define xx(n) {n ## _init, n ## _exit},
+ xx(local)
+ xx(dm_target)
+ xx(dm_linear)
+ xx(dm_stripe)
+ xx(dm_snapshot)
+ xx(dm_interface)
+#undef xx
+};
+
+static int __init dm_init(void)
+{
+ const int count = ARRAY_SIZE(_inits);
+
+ int r, i;
+
+ for (i = 0; i < count; i++) {
+ r = _inits[i].init();
+ if (r)
+ goto bad;
+ }
+
+ return 0;
+
+ bad:
+ while (i--)
+ _inits[i].exit();
+
+ return r;
+}
+
+static void __exit dm_exit(void)
+{
+ int i = ARRAY_SIZE(_inits);
+
+ while (i--)
+ _inits[i].exit();
+}
+
+/*
+ * Block device functions
+ */
+static int dm_blk_open(struct inode *inode, struct file *file)
+{
+ struct mapped_device *md;
+
+ md = get_kdev(inode->i_rdev);
+ if (!md)
+ return -ENXIO;
+
+ return 0;
+}
+
+static int dm_blk_close(struct inode *inode, struct file *file)
+{
+ struct mapped_device *md;
+
+ md = get_kdev(inode->i_rdev);
+ dm_put(md); /* put the reference gained by dm_blk_open */
+ dm_put(md);
+ return 0;
+}
+
+static inline struct dm_io *alloc_io(void)
+{
+ return mempool_alloc(_io_pool, GFP_NOIO);
+}
+
+static inline void free_io(struct dm_io *io)
+{
+ mempool_free(io, _io_pool);
+}
+
+static inline struct deferred_io *alloc_deferred(void)
+{
+ return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
+}
+
+static inline void free_deferred(struct deferred_io *di)
+{
+ kfree(di);
+}
+
+/* In 512-byte units */
+#define VOLUME_SIZE(minor) (_block_size[(minor)] << 1)
+
+/* FIXME: check this */
+static int dm_blk_ioctl(struct inode *inode, struct file *file,
+ uint command, unsigned long a)
+{
+ int minor = MINOR(inode->i_rdev);
+ long size;
+
+ if (minor >= MAX_DEVICES)
+ return -ENXIO;
+
+ switch (command) {
+ case BLKROSET:
+ case BLKROGET:
+ case BLKRASET:
+ case BLKRAGET:
+ case BLKFLSBUF:
+ case BLKSSZGET:
+ //case BLKRRPART: /* Re-read partition tables */
+ //case BLKPG:
+ case BLKELVGET:
+ case BLKELVSET:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ return blk_ioctl(inode->i_rdev, command, a);
+ break;
+
+ case BLKGETSIZE:
+ size = VOLUME_SIZE(minor);
+ if (copy_to_user((void *) a, &size, sizeof(long)))
+ return -EFAULT;
+ break;
+
+ case BLKGETSIZE64:
+ size = VOLUME_SIZE(minor);
+ if (put_user((u64) ((u64) size) << 9, (u64 *) a))
+ return -EFAULT;
+ break;
+
+ case BLKRRPART:
+ return -ENOTTY;
+
+ case LV_BMAP:
+ return dm_user_bmap(inode, (struct lv_bmap *) a);
+
+ default:
+ DMWARN("unknown block ioctl 0x%x", command);
+ return -ENOTTY;
+ }
+
+ return 0;
+}
+
+/*
+ * Add the buffer to the list of deferred io.
+ */
+static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
+{
+ struct deferred_io *di;
+
+ di = alloc_deferred();
+ if (!di)
+ return -ENOMEM;
+
+ down_write(&md->lock);
+
+ if (!test_bit(DMF_SUSPENDED, &md->flags)) {
+ up_write(&md->lock);
+ free_deferred(di);
+ return 1;
+ }
+
+ di->bh = bh;
+ di->rw = rw;
+ di->next = md->deferred;
+ md->deferred = di;
+
+ up_write(&md->lock);
+ return 0; /* deferred successfully */
+}
+
+/*
+ * bh->b_end_io routine that decrements the pending count
+ * and then calls the original bh->b_end_io fn.
+ */
+static void dec_pending(struct buffer_head *bh, int uptodate)
+{
+ struct dm_io *io = bh->b_private;
+
+ if (atomic_dec_and_test(&io->md->pending))
+ /* nudge anyone waiting on suspend queue */
+ wake_up(&io->md->wait);
+
+ bh->b_end_io = io->end_io;
+ bh->b_private = io->context;
+ free_io(io);
+
+ bh->b_end_io(bh, uptodate);
+}
+
+/*
+ * Do the bh mapping for a given leaf
+ */
+static inline int __map_buffer(struct mapped_device *md,
+ int rw, struct buffer_head *bh)
+{
+ int r;
+ struct dm_io *io;
+ struct dm_target *ti;
+
+ ti = dm_table_find_target(md->map, bh->b_rsector);
+ if (!ti)
+ return -EINVAL;
+
+ io = alloc_io();
+ if (!io)
+ return -ENOMEM;
+
+ io->md = md;
+ io->end_io = bh->b_end_io;
+ io->context = bh->b_private;
+
+ r = ti->type->map(ti, bh, rw);
+
+ if (r > 0) {
+ /* hook the end io request fn */
+ atomic_inc(&md->pending);
+ bh->b_end_io = dec_pending;
+ bh->b_private = io;
+
+ } else
+ /* we don't need to hook */
+ free_io(io);
+
+ return r;
+}
+
+static int __request(struct mapped_device *md, int rw, struct buffer_head *bh)
+{
+ int r;
+
+ /*
+ * If we're suspended we have to queue this io for later.
+ */
+ while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ up_read(&md->lock);
+
+ /*
+ * There's no point deferring a read ahead
+ * request, just drop it.
+ */
+ if (rw == READA) {
+ r = -EIO;
+ goto out_no_lock;
+ }
+
+ r = queue_io(md, bh, rw);
+ if (r <= 0)
+ /*
+ * Either an error occurred or we deferred
+ * successfully.
+ */
+ goto out_no_lock;
+
+ /*
+ * We're in a while loop, because someone could
+ * suspend before we get to the following read
+ * lock.
+ */
+ down_read(&md->lock);
+ }
+
+ r = __map_buffer(md, rw, bh);
+
+ out_no_lock:
+ down_read(&md->lock);
+ return r;
+}
+
+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
+{
+ int r;
+ struct mapped_device *md;
+
+ md = get_kdev(bh->b_rdev);
+ if (!md) {
+ buffer_IO_error(bh);
+ return 0;
+ }
+
+ down_read(&md->lock);
+
+ r = __request(md, rw, bh);
+ if (r < 0) {
+ buffer_IO_error(bh);
+ r = 0;
+ }
+
+ up_read(&md->lock);
+ dm_put(md);
+ return r;
+}
+
+static int check_dev_size(kdev_t dev, unsigned long block)
+{
+ /* FIXME: check this */
+ int minor = MINOR(dev);
+ unsigned long max_sector = (_block_size[minor] << 1) + 1;
+ unsigned long sector = (block + 1) * (_blksize_size[minor] >> 9);
+
+ return (sector > max_sector) ? 0 : 1;
+}
+
+/*
+ * Creates a dummy buffer head and maps it (for lilo).
+ */
+static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
+ kdev_t *r_dev, unsigned long *r_block)
+{
+ struct buffer_head bh;
+ struct dm_target *ti;
+ int r;
+
+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ return -EPERM;
+ }
+
+ if (!check_dev_size(dev, block)) {
+ return -EINVAL;
+ }
+
+ /* setup dummy bh */
+ memset(&bh, 0, sizeof(bh));
+ bh.b_blocknr = block;
+ bh.b_dev = bh.b_rdev = dev;
+ bh.b_size = _blksize_size[MINOR(dev)];
+ bh.b_rsector = block * (bh.b_size >> 9);
+
+ /* find target */
+ ti = dm_table_find_target(md->map, bh.b_rsector);
+
+ /* do the mapping */
+ r = ti->type->map(ti, &bh, READ);
+
+ if (!r) {
+ *r_dev = bh.b_rdev;
+ *r_block = bh.b_rsector / (bh.b_size >> 9);
+ }
+
+ return r;
+}
+
+/*
+ * Marshals arguments and results between user and kernel space.
+ */
+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
+{
+ struct mapped_device *md;
+ unsigned long block, r_block;
+ kdev_t r_dev;
+ int r;
+
+ if (get_user(block, &lvb->lv_block))
+ return -EFAULT;
+
+ md = get_kdev(inode->i_rdev);
+ if (!md)
+ return -ENXIO;
+
+ down_read(&md->lock);
+ r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
+ up_read(&md->lock);
+ dm_put(md);
+
+ if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
+ put_user(r_block, &lvb->lv_block)))
+ r = -EFAULT;
+
+ return r;
+}
+
+/*-----------------------------------------------------------------
+ * A bitset is used to keep track of allocated minor numbers.
+ *---------------------------------------------------------------*/
+static spinlock_t _minor_lock = SPIN_LOCK_UNLOCKED;
+static struct mapped_device * _mds[MAX_DEVICES];
+
+static void free_minor(int minor)
+{
+ spin_lock(&_minor_lock);
+ _mds[minor] = NULL;
+ spin_unlock(&_minor_lock);
+}
+
+/*
+ * See if the device with a specific minor # is free.
+ */
+static int specific_minor(int minor, struct mapped_device *md)
+{
+ int r = -EBUSY;
+
+ if (minor >= MAX_DEVICES) {
+ DMWARN("request for a mapped_device beyond MAX_DEVICES (%d)",
+ MAX_DEVICES);
+ return -EINVAL;
+ }
+
+ spin_lock(&_minor_lock);
+ if (!_mds[minor]) {
+ _mds[minor] = md;
+ r = minor;
+ }
+ spin_unlock(&_minor_lock);
+
+ return r;
+}
+
+static int next_free_minor(struct mapped_device *md)
+{
+ int i;
+
+ spin_lock(&_minor_lock);
+ for (i = 0; i < MAX_DEVICES; i++) {
+ if (!_mds[i]) {
+ _mds[i] = md;
+ break;
+ }
+ }
+ spin_unlock(&_minor_lock);
+
+ return (i < MAX_DEVICES) ? i : -EBUSY;
+}
+
+static struct mapped_device *get_kdev(kdev_t dev)
+{
+ struct mapped_device *md;
+
+ if (major(dev) != _major)
+ return NULL;
+
+	spin_lock(&_minor_lock);
+ md = _mds[minor(dev)];
+ if (md)
+ dm_get(md);
+	spin_unlock(&_minor_lock);
+
+ return md;
+}
+
+/*
+ * Allocate and initialise a blank device with a given minor.
+ */
+static struct mapped_device *alloc_dev(int minor)
+{
+ struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
+
+ if (!md) {
+ DMWARN("unable to allocate device, out of memory.");
+ return NULL;
+ }
+
+ /* get a minor number for the dev */
+ minor = (minor < 0) ? next_free_minor(md) : specific_minor(minor, md);
+ if (minor < 0) {
+ kfree(md);
+ return NULL;
+ }
+
+ DMWARN("allocating minor %d.", minor);
+ memset(md, 0, sizeof(*md));
+ md->dev = mk_kdev(_major, minor);
+ init_rwsem(&md->lock);
+ atomic_set(&md->holders, 1);
+ atomic_set(&md->pending, 0);
+ init_waitqueue_head(&md->wait);
+
+ return md;
+}
+
+static void free_dev(struct mapped_device *md)
+{
+ free_minor(minor(md->dev));
+ kfree(md);
+}
+
+/*
+ * The hardsect size for a mapped device is the largest hardsect size
+ * from the devices it maps onto.
+ */
+static int __find_hardsect_size(struct list_head *devices)
+{
+ int result = 512, size;
+ struct list_head *tmp;
+
+ list_for_each(tmp, devices) {
+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+ size = get_hardsect_size(dd->dev);
+ if (size > result)
+ result = size;
+ }
+
+ return result;
+}
+
+/*
+ * Bind a table to the device.
+ */
+static int __bind(struct mapped_device *md, struct dm_table *t)
+{
+ int minor = minor(md->dev);
+ md->map = t;
+
+ /* in k */
+ _block_size[minor] = dm_table_get_size(t) >> 1;
+ _blksize_size[minor] = BLOCK_SIZE;
+ _hardsect_size[minor] = __find_hardsect_size(dm_table_get_devices(t));
+ register_disk(NULL, md->dev, 1, &dm_blk_dops, _block_size[minor]);
+
+ dm_table_get(t);
+ return 0;
+}
+
+static void __unbind(struct mapped_device *md)
+{
+ int minor = minor(md->dev);
+
+ dm_table_put(md->map);
+ md->map = NULL;
+
+ _block_size[minor] = 0;
+ _blksize_size[minor] = 0;
+ _hardsect_size[minor] = 0;
+}
+
+/*
+ * Constructor for a new device.
+ */
+int dm_create(int minor, struct dm_table *table, struct mapped_device **result)
+{
+ int r;
+ struct mapped_device *md;
+
+ md = alloc_dev(minor);
+ if (!md)
+ return -ENXIO;
+
+ r = __bind(md, table);
+ if (r) {
+ free_dev(md);
+ return r;
+ }
+
+ *result = md;
+ return 0;
+}
+
+void dm_get(struct mapped_device *md)
+{
+ atomic_inc(&md->holders);
+}
+
+void dm_put(struct mapped_device *md)
+{
+ if (atomic_dec_and_test(&md->holders)) {
+ DMWARN("destroying md");
+ __unbind(md);
+ free_dev(md);
+ }
+}
+
+/*
+ * Requeue the deferred io by calling generic_make_request.
+ */
+static void flush_deferred_io(struct deferred_io *c)
+{
+ struct deferred_io *n;
+
+ while (c) {
+ n = c->next;
+ generic_make_request(c->rw, c->bh);
+ free_deferred(c);
+ c = n;
+ }
+}
+
+/*
+ * Swap in a new table (destroying old one).
+ */
+int dm_swap_table(struct mapped_device *md, struct dm_table *table)
+{
+ int r;
+
+ down_write(&md->lock);
+
+ /* device must be suspended */
+ if (!test_bit(DMF_SUSPENDED, &md->flags)) {
+ up_write(&md->lock);
+ return -EPERM;
+ }
+
+ __unbind(md);
+ r = __bind(md, table);
+	if (r) {
+		up_write(&md->lock);
+		return r;
+	}
+
+ up_write(&md->lock);
+ return 0;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem. For example we might want to move some data in
+ * the background. Before the table can be swapped with
+ * dm_swap_table, dm_suspend must be called to flush any in
+ * flight io and ensure that any further io gets deferred.
+ */
+int dm_suspend(struct mapped_device *md)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ down_write(&md->lock);
+
+ /*
+ * First we set the BLOCK_IO flag so no more ios will be
+ * mapped.
+ */
+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ up_write(&md->lock);
+ return -EINVAL;
+ }
+
+ set_bit(DMF_BLOCK_IO, &md->flags);
+ up_write(&md->lock);
+
+ /*
+ * Then we wait for the already mapped ios to
+ * complete.
+ */
+ down_read(&md->lock);
+
+ add_wait_queue(&md->wait, &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!atomic_read(&md->pending))
+ break;
+
+ schedule();
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&md->wait, &wait);
+ up_read(&md->lock);
+
+ /* set_bit is atomic */
+ set_bit(DMF_SUSPENDED, &md->flags);
+
+ return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+ struct deferred_io *def;
+
+ down_write(&md->lock);
+ if (!test_bit(DMF_SUSPENDED, &md->flags) ||
+ !dm_table_get_size(md->map)) {
+ up_write(&md->lock);
+ return -EINVAL;
+ }
+
+ clear_bit(DMF_SUSPENDED, &md->flags);
+ clear_bit(DMF_BLOCK_IO, &md->flags);
+ def = md->deferred;
+ md->deferred = NULL;
+ up_write(&md->lock);
+
+ flush_deferred_io(def);
+ run_task_queue(&tq_disk);
+
+ return 0;
+}
+
+struct dm_table *dm_get_table(struct mapped_device *md)
+{
+ struct dm_table *t;
+
+ down_read(&md->lock);
+ t = md->map;
+ dm_table_get(t);
+ up_read(&md->lock);
+
+ return t;
+}
+
+kdev_t dm_kdev(struct mapped_device *md)
+{
+ kdev_t dev;
+
+ down_read(&md->lock);
+ dev = md->dev;
+ up_read(&md->lock);
+
+ return dev;
+}
+
+int dm_suspended(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED, &md->flags);
+}
+
+struct block_device_operations dm_blk_dops = {
+ .open = dm_blk_open,
+ .release = dm_blk_close,
+ .ioctl = dm_blk_ioctl,
+ .owner = THIS_MODULE
+};
+
+/*
+ * module hooks
+ */
+module_init(dm_init);
+module_exit(dm_exit);
+
+MODULE_PARM(major, "i");
+MODULE_PARM_DESC(major, "The major number of the device mapper");
+MODULE_DESCRIPTION(DM_NAME " driver");
+MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
+MODULE_LICENSE("GPL");
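
For reference, replacing the table of a live device is expected to follow the
suspend/swap/resume sequence implemented above. A minimal sketch, assuming the
caller already holds references on the device and the new table; reload_example
is a hypothetical helper, not part of this patch:

	#include "dm.h"		/* drivers/md/dm.h from this patch */

	static int reload_example(struct mapped_device *md,
				  struct dm_table *new_table)
	{
		int r;

		r = dm_suspend(md);	/* block new io, wait for in-flight io */
		if (r)
			return r;

		r = dm_swap_table(md, new_table);	/* drops the old table */
		if (r)
			return r;

		return dm_resume(md);	/* requeue any deferred io */
	}
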
--- diff/drivers/md/dm.h 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/dm.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,151 @@
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2001, 2002 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_INTERNAL_H
+#define DM_INTERNAL_H
+
+#include <linux/fs.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/blkdev.h>
+
+#define DM_NAME "device-mapper"
+#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
+#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
+#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+
+/*
+ * FIXME: I think this should be with the definition of sector_t
+ * in types.h.
+ */
+#ifdef CONFIG_LBD
+#define SECTOR_FORMAT "%Lu"
+#else
+#define SECTOR_FORMAT "%lu"
+#endif
+
+extern struct block_device_operations dm_blk_dops;
+
+/*
+ * List of devices that a metadevice uses and should open/close.
+ */
+struct dm_dev {
+ struct list_head list;
+
+ atomic_t count;
+ int mode;
+ kdev_t dev;
+ struct block_device *bdev;
+};
+
+struct dm_table;
+struct mapped_device;
+
+/*-----------------------------------------------------------------
+ * Functions for manipulating a struct mapped_device.
+ * Drop the reference with dm_put when you finish with the object.
+ *---------------------------------------------------------------*/
+int dm_create(int minor, struct dm_table *table, struct mapped_device **md);
+
+/*
+ * Reference counting for md.
+ */
+void dm_get(struct mapped_device *md);
+void dm_put(struct mapped_device *md);
+
+/*
+ * A device can still be used while suspended, but I/O is deferred.
+ */
+int dm_suspend(struct mapped_device *md);
+int dm_resume(struct mapped_device *md);
+
+/*
+ * The device must be suspended before calling this method.
+ */
+int dm_swap_table(struct mapped_device *md, struct dm_table *t);
+
+/*
+ * Drop a reference on the table when you've finished with the
+ * result.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md);
+
+/*
+ * Info functions.
+ */
+kdev_t dm_kdev(struct mapped_device *md);
+int dm_suspended(struct mapped_device *md);
+
+/*-----------------------------------------------------------------
+ * Functions for manipulating a table. Tables are also reference
+ * counted.
+ *---------------------------------------------------------------*/
+int dm_table_create(struct dm_table **result, int mode);
+
+void dm_table_get(struct dm_table *t);
+void dm_table_put(struct dm_table *t);
+
+int dm_table_add_target(struct dm_table *t, const char *type,
+ sector_t start, sector_t len, char *params);
+int dm_table_complete(struct dm_table *t);
+void dm_table_event(struct dm_table *t);
+sector_t dm_table_get_size(struct dm_table *t);
+struct dm_target *dm_table_get_target(struct dm_table *t, int index);
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+unsigned int dm_table_get_num_targets(struct dm_table *t);
+struct list_head *dm_table_get_devices(struct dm_table *t);
+int dm_table_get_mode(struct dm_table *t);
+void dm_table_add_wait_queue(struct dm_table *t, wait_queue_t *wq);
+
+/*-----------------------------------------------------------------
+ * A registry of target types.
+ *---------------------------------------------------------------*/
+int dm_target_init(void);
+void dm_target_exit(void);
+struct target_type *dm_get_target_type(const char *name);
+void dm_put_target_type(struct target_type *t);
+
+
+/*-----------------------------------------------------------------
+ * Useful inlines.
+ *---------------------------------------------------------------*/
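+/*
+ * Returns non-zero if an allocation of 'fixed + obj * num' bytes
+ * would overflow an unsigned long.
+ */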
+static inline int array_too_big(unsigned long fixed, unsigned long obj,
+ unsigned long num)
+{
+ return (num > (ULONG_MAX - fixed) / obj);
+}
+
+/*
+ * ceiling(n / size) * size
+ */
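+/* e.g. dm_round_up(1021, 512) == 1024 */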
+static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
+{
+ unsigned long r = n % size;
+ return n + (r ? (size - r) : 0);
+}
+
+/*
+ * The device-mapper can be driven through one of two interfaces:
+ * ioctl or filesystem, depending on which patch you have applied.
+ */
+int dm_interface_init(void);
+void dm_interface_exit(void);
+
+/*
+ * Targets for linear and striped mappings
+ */
+int dm_linear_init(void);
+void dm_linear_exit(void);
+
+int dm_stripe_init(void);
+void dm_stripe_exit(void);
+
+int dm_snapshot_init(void);
+void dm_snapshot_exit(void);
+
+#endif
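
The table constructors declared above pair with dm_create(); a minimal sketch of
building a one-target device, assuming dm_table_create() returns a table with
one reference held by the caller and that the linear target takes a
"<device path> <offset>" parameter string (create_example and the sizes used
are purely illustrative):

	#include "dm.h"		/* drivers/md/dm.h from this patch */

	static int create_example(struct mapped_device **md)
	{
		struct dm_table *t;
		char params[] = "/dev/hdb1 0";	/* assumed linear-target format */
		int r;

		r = dm_table_create(&t, FMODE_READ | FMODE_WRITE);
		if (r)
			return r;

		r = dm_table_add_target(t, "linear", 0, 1024, params);
		if (!r)
			r = dm_table_complete(t);
		if (!r)
			r = dm_create(-1, t, md);	/* -1: next free minor */

		dm_table_put(t);	/* a bound device keeps its own reference */
		return r;
	}
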
--- diff/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/kcopyd.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,843 @@
+/*
+ * Copyright (C) 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <asm/atomic.h>
+
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/locks.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "kcopyd.h"
+
+/* FIXME: this is only needed for the DMERR macros */
+#include "dm.h"
+
+/*
+ * Hard sector size used all over the kernel.
+ */
+#define SECTOR_SIZE 512
+#define SECTOR_SHIFT 9
+
+static void wake_kcopyd(void);
+
+/*-----------------------------------------------------------------
+ * We reserve our own pool of preallocated pages that are
+ * only used for kcopyd io.
+ *---------------------------------------------------------------*/
+
+/*
+ * FIXME: This should be configurable.
+ */
+#define NUM_PAGES 512
+
+static DECLARE_MUTEX(_pages_lock);
+static int _num_free_pages;
+static struct page *_pages_array[NUM_PAGES];
+static DECLARE_MUTEX(start_lock);
+
+static int init_pages(void)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < NUM_PAGES; i++) {
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ goto bad;
+
+ LockPage(p);
+ _pages_array[i] = p;
+ }
+
+ _num_free_pages = NUM_PAGES;
+ return 0;
+
+ bad:
+ while (i--)
+ __free_page(_pages_array[i]);
+ return -ENOMEM;
+}
+
+static void exit_pages(void)
+{
+ int i;
+ struct page *p;
+
+ for (i = 0; i < NUM_PAGES; i++) {
+ p = _pages_array[i];
+ UnlockPage(p);
+ __free_page(p);
+ }
+
+ _num_free_pages = 0;
+}
+
+static int kcopyd_get_pages(int num, struct page **result)
+{
+ int i;
+
+ down(&_pages_lock);
+ if (_num_free_pages < num) {
+ up(&_pages_lock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < num; i++) {
+ _num_free_pages--;
+ result[i] = _pages_array[_num_free_pages];
+ }
+ up(&_pages_lock);
+
+ return 0;
+}
+
+static void kcopyd_free_pages(int num, struct page **result)
+{
+ int i;
+
+ down(&_pages_lock);
+ for (i = 0; i < num; i++)
+ _pages_array[_num_free_pages++] = result[i];
+ up(&_pages_lock);
+}
+
+/*-----------------------------------------------------------------
+ * We keep our own private pool of buffer_heads. These are just
+ * held in a list on the b_reqnext field.
+ *---------------------------------------------------------------*/
+
+/*
+ * Make sure we have enough buffers to always keep the pages
+ * occupied. So we assume the worst case scenario where blocks
+ * are the size of a single sector.
+ */
+#define NUM_BUFFERS (NUM_PAGES * (PAGE_SIZE / SECTOR_SIZE))
+
+static spinlock_t _buffer_lock = SPIN_LOCK_UNLOCKED;
+static struct buffer_head *_all_buffers;
+static struct buffer_head *_free_buffers;
+
+static int init_buffers(void)
+{
+ int i;
+ struct buffer_head *buffers;
+
+ buffers = vcalloc(NUM_BUFFERS, sizeof(struct buffer_head));
+ if (!buffers) {
+ DMWARN("Couldn't allocate buffer heads.");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NUM_BUFFERS; i++) {
+ if (i < NUM_BUFFERS - 1)
+ buffers[i].b_reqnext = &buffers[i + 1];
+ init_waitqueue_head(&buffers[i].b_wait);
+ INIT_LIST_HEAD(&buffers[i].b_inode_buffers);
+ }
+
+ _all_buffers = _free_buffers = buffers;
+ return 0;
+}
+
+static void exit_buffers(void)
+{
+ vfree(_all_buffers);
+}
+
+static struct buffer_head *alloc_buffer(void)
+{
+ struct buffer_head *r;
+	unsigned long flags;
+
+ spin_lock_irqsave(&_buffer_lock, flags);
+
+ if (!_free_buffers)
+ r = NULL;
+ else {
+ r = _free_buffers;
+ _free_buffers = _free_buffers->b_reqnext;
+ r->b_reqnext = NULL;
+ }
+
+ spin_unlock_irqrestore(&_buffer_lock, flags);
+
+ return r;
+}
+
+/*
+ * Only called from interrupt context.
+ */
+static void free_buffer(struct buffer_head *bh)
+{
+	unsigned long flags;
+	int was_empty;
+
+ spin_lock_irqsave(&_buffer_lock, flags);
+ was_empty = (_free_buffers == NULL) ? 1 : 0;
+ bh->b_reqnext = _free_buffers;
+ _free_buffers = bh;
+ spin_unlock_irqrestore(&_buffer_lock, flags);
+
+ /*
+ * If the buffer list was empty then kcopyd probably went
+ * to sleep because it ran out of buffer heads, so let's
+ * wake it up.
+ */
+ if (was_empty)
+ wake_kcopyd();
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+#define MIN_JOBS NUM_PAGES
+
+static kmem_cache_t *_job_cache = NULL;
+static mempool_t *_job_pool = NULL;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i) jobs waiting for pages
+ * ii) jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+
+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(_complete_jobs);
+static LIST_HEAD(_io_jobs);
+static LIST_HEAD(_pages_jobs);
+
+static int init_jobs(void)
+{
+ INIT_LIST_HEAD(&_complete_jobs);
+ INIT_LIST_HEAD(&_io_jobs);
+ INIT_LIST_HEAD(&_pages_jobs);
+
+ _job_cache = kmem_cache_create("kcopyd-jobs", sizeof(struct kcopyd_job),
+ __alignof__(struct kcopyd_job),
+ 0, NULL, NULL);
+ if (!_job_cache)
+ return -ENOMEM;
+
+ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
+ mempool_free_slab, _job_cache);
+ if (!_job_pool) {
+ kmem_cache_destroy(_job_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void exit_jobs(void)
+{
+ mempool_destroy(_job_pool);
+ kmem_cache_destroy(_job_cache);
+}
+
+struct kcopyd_job *kcopyd_alloc_job(void)
+{
+ struct kcopyd_job *job;
+
+ job = mempool_alloc(_job_pool, GFP_NOIO);
+ if (!job)
+ return NULL;
+
+ memset(job, 0, sizeof(*job));
+ return job;
+}
+
+void kcopyd_free_job(struct kcopyd_job *job)
+{
+ mempool_free(job, _job_pool);
+}
+
+/*
+ * Functions to push and pop a job onto the head of a given job
+ * list.
+ */
+static inline struct kcopyd_job *pop(struct list_head *jobs)
+{
+ struct kcopyd_job *job = NULL;
+	unsigned long flags;
+
+ spin_lock_irqsave(&_job_lock, flags);
+
+ if (!list_empty(jobs)) {
+ job = list_entry(jobs->next, struct kcopyd_job, list);
+ list_del(&job->list);
+ }
+ spin_unlock_irqrestore(&_job_lock, flags);
+
+ return job;
+}
+
+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+
+ spin_lock_irqsave(&_job_lock, flags);
+ list_add(&job->list, jobs);
+ spin_unlock_irqrestore(&_job_lock, flags);
+}
+
+/*
+ * Completion function for one of our buffers.
+ */
+static void end_bh(struct buffer_head *bh, int uptodate)
+{
+ struct kcopyd_job *job = bh->b_private;
+
+ mark_buffer_uptodate(bh, uptodate);
+ unlock_buffer(bh);
+
+ if (!uptodate)
+ job->err = -EIO;
+
+ /* are we the last ? */
+ if (atomic_dec_and_test(&job->nr_incomplete)) {
+ push(&_complete_jobs, job);
+ wake_kcopyd();
+ }
+
+ free_buffer(bh);
+}
+
+static void dispatch_bh(struct kcopyd_job *job,
+ struct buffer_head *bh, int block)
+{
+ int p;
+
+ /*
+ * Add in the job offset
+ */
+ bh->b_blocknr = (job->disk.sector >> job->block_shift) + block;
+
+ p = block >> job->bpp_shift;
+ block &= job->bpp_mask;
+
+ bh->b_dev = B_FREE;
+ bh->b_size = job->block_size;
+ set_bh_page(bh, job->pages[p], ((block << job->block_shift) +
+ job->offset) << SECTOR_SHIFT);
+ bh->b_this_page = bh;
+
+ init_buffer(bh, end_bh, job);
+
+ bh->b_dev = job->disk.dev;
+ bh->b_state = ((1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req));
+
+ set_bit(BH_Uptodate, &bh->b_state);
+ if (job->rw == WRITE)
+ clear_bit(BH_Dirty, &bh->b_state);
+
+ submit_bh(job->rw, bh);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * < 0: error
+ * 0: success
+ * > 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+ job->callback(job);
+ return 0;
+}
+
+/*
+ * Request io on as many buffer heads as we can currently get for
+ * a particular job.
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+ unsigned int block;
+ struct buffer_head *bh;
+
+ for (block = atomic_read(&job->nr_requested);
+ block < job->nr_blocks; block++) {
+ bh = alloc_buffer();
+ if (!bh)
+ break;
+
+ atomic_inc(&job->nr_requested);
+ dispatch_bh(job, bh, block);
+ }
+
+ return (block == job->nr_blocks) ? 0 : 1;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+ int r;
+
+ job->nr_pages = (job->disk.count + job->offset) /
+ (PAGE_SIZE / SECTOR_SIZE);
+ r = kcopyd_get_pages(job->nr_pages, job->pages);
+
+ if (!r) {
+ /* this job is ready for io */
+ push(&_io_jobs, job);
+ return 0;
+ }
+
+ if (r == -ENOMEM)
+ /* can complete now */
+ return 1;
+
+ return r;
+}
+
+/*
+ * Run through a list for as long as possible. Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+{
+ struct kcopyd_job *job;
+ int r, count = 0;
+
+ while ((job = pop(jobs))) {
+
+ r = fn(job);
+
+ if (r < 0) {
+ /* error this rogue job */
+ job->err = r;
+ push(&_complete_jobs, job);
+ break;
+ }
+
+ if (r > 0) {
+ /*
+ * We couldn't service this job ATM, so
+ * push this job back onto the list.
+ */
+ push(jobs, job);
+ break;
+ }
+
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(void)
+{
+ int count;
+
+ /*
+ * We loop round until there is no more work to do.
+ */
+ do {
+ count = process_jobs(&_complete_jobs, run_complete_job);
+ count += process_jobs(&_io_jobs, run_io_job);
+ count += process_jobs(&_pages_jobs, run_pages_job);
+
+ } while (count);
+
+ run_task_queue(&tq_disk);
+}
+
+/*-----------------------------------------------------------------
+ * The daemon
+ *---------------------------------------------------------------*/
+static atomic_t _kcopyd_must_die;
+static DECLARE_MUTEX(_run_lock);
+static DECLARE_WAIT_QUEUE_HEAD(_job_queue);
+
+static int kcopyd(void *arg)
+{
+ DECLARE_WAITQUEUE(wq, current);
+
+ daemonize();
+ strcpy(current->comm, "kcopyd");
+ atomic_set(&_kcopyd_must_die, 0);
+
+ add_wait_queue(&_job_queue, &wq);
+
+ down(&_run_lock);
+ up(&start_lock);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (atomic_read(&_kcopyd_must_die))
+ break;
+
+ do_work();
+ schedule();
+ }
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&_job_queue, &wq);
+
+ up(&_run_lock);
+
+ return 0;
+}
+
+static int start_daemon(void)
+{
+ static pid_t pid = 0;
+
+ down(&start_lock);
+
+ pid = kernel_thread(kcopyd, NULL, 0);
+	if (pid <= 0) {
+		DMERR("Failed to start kcopyd thread");
+		up(&start_lock);
+		return -EAGAIN;
+	}
+
+ /*
+ * wait for the daemon to up this mutex.
+ */
+ down(&start_lock);
+ up(&start_lock);
+
+ return 0;
+}
+
+static int stop_daemon(void)
+{
+ atomic_set(&_kcopyd_must_die, 1);
+ wake_kcopyd();
+ down(&_run_lock);
+ up(&_run_lock);
+
+ return 0;
+}
+
+static void wake_kcopyd(void)
+{
+ wake_up_interruptible(&_job_queue);
+}
+
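+/* Returns floor(log2(n)), e.g. calc_shift(8) == 3. */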
+static int calc_shift(unsigned int n)
+{
+ int s;
+
+ for (s = 0; n; s++, n >>= 1)
+ ;
+
+ return --s;
+}
+
+static void calc_block_sizes(struct kcopyd_job *job)
+{
+ job->block_size = get_hardsect_size(job->disk.dev);
+ job->block_shift = calc_shift(job->block_size / SECTOR_SIZE);
+ job->bpp_shift = PAGE_SHIFT - job->block_shift - SECTOR_SHIFT;
+ job->bpp_mask = (1 << job->bpp_shift) - 1;
+ job->nr_blocks = job->disk.count >> job->block_shift;
+ atomic_set(&job->nr_requested, 0);
+ atomic_set(&job->nr_incomplete, job->nr_blocks);
+}
+
+int kcopyd_io(struct kcopyd_job *job)
+{
+ calc_block_sizes(job);
+ push(job->pages[0] ? &_io_jobs : &_pages_jobs, job);
+ wake_kcopyd();
+ return 0;
+}
+
+/*-----------------------------------------------------------------
+ * The copier is implemented on top of the simpler async io
+ * daemon above.
+ *---------------------------------------------------------------*/
+struct copy_info {
+ kcopyd_notify_fn notify;
+ void *notify_context;
+
+ struct kcopyd_region to;
+};
+
+#define MIN_INFOS 128
+static kmem_cache_t *_copy_cache = NULL;
+static mempool_t *_copy_pool = NULL;
+
+static int init_copier(void)
+{
+ _copy_cache = kmem_cache_create("kcopyd-info",
+ sizeof(struct copy_info),
+ __alignof__(struct copy_info),
+ 0, NULL, NULL);
+ if (!_copy_cache)
+ return -ENOMEM;
+
+ _copy_pool = mempool_create(MIN_INFOS, mempool_alloc_slab,
+ mempool_free_slab, _copy_cache);
+ if (!_copy_pool) {
+ kmem_cache_destroy(_copy_cache);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void exit_copier(void)
+{
+ if (_copy_pool)
+ mempool_destroy(_copy_pool);
+
+ if (_copy_cache)
+ kmem_cache_destroy(_copy_cache);
+}
+
+static inline struct copy_info *alloc_copy_info(void)
+{
+ return mempool_alloc(_copy_pool, GFP_NOIO);
+}
+
+static inline void free_copy_info(struct copy_info *info)
+{
+ mempool_free(info, _copy_pool);
+}
+
+void copy_complete(struct kcopyd_job *job)
+{
+ struct copy_info *info = (struct copy_info *) job->context;
+
+ if (info->notify)
+ info->notify(job->err, info->notify_context);
+
+ free_copy_info(info);
+
+ kcopyd_free_pages(job->nr_pages, job->pages);
+
+ kcopyd_free_job(job);
+}
+
+static void page_write_complete(struct kcopyd_job *job)
+{
+ struct copy_info *info = (struct copy_info *) job->context;
+ int i;
+
+ if (info->notify)
+ info->notify(job->err, info->notify_context);
+
+ free_copy_info(info);
+ for (i = 0; i < job->nr_pages; i++)
+ put_page(job->pages[i]);
+
+ kcopyd_free_job(job);
+}
+
+/*
+ * These callback functions implement the state machine that copies regions.
+ */
+void copy_write(struct kcopyd_job *job)
+{
+ struct copy_info *info = (struct copy_info *) job->context;
+
+ if (job->err && info->notify) {
+ info->notify(job->err, job->context);
+ kcopyd_free_job(job);
+ free_copy_info(info);
+ return;
+ }
+
+ job->rw = WRITE;
+ memcpy(&job->disk, &info->to, sizeof(job->disk));
+ job->callback = copy_complete;
+ job->context = info;
+
+ /*
+ * Queue the write.
+ */
+ kcopyd_io(job);
+}
+
+int kcopyd_write_pages(struct kcopyd_region *to, int nr_pages,
+ struct page **pages, int offset, kcopyd_notify_fn fn,
+ void *context)
+{
+ struct copy_info *info;
+ struct kcopyd_job *job;
+ int i;
+
+ /*
+ * Allocate a new copy_info.
+ */
+ info = alloc_copy_info();
+ if (!info)
+ return -ENOMEM;
+
+ job = kcopyd_alloc_job();
+ if (!job) {
+ free_copy_info(info);
+ return -ENOMEM;
+ }
+
+ /*
+ * set up for the write.
+ */
+ info->notify = fn;
+ info->notify_context = context;
+ memcpy(&info->to, to, sizeof(*to));
+
+ /* Get the pages */
+ job->nr_pages = nr_pages;
+ for (i = 0; i < nr_pages; i++) {
+ get_page(pages[i]);
+ job->pages[i] = pages[i];
+ }
+
+ job->rw = WRITE;
+
+ memcpy(&job->disk, &info->to, sizeof(job->disk));
+ job->offset = offset;
+ calc_block_sizes(job);
+ job->callback = page_write_complete;
+ job->context = info;
+
+ /*
+ * Trigger job.
+ */
+ kcopyd_io(job);
+ return 0;
+}
+
+int kcopyd_copy(struct kcopyd_region *from, struct kcopyd_region *to,
+ kcopyd_notify_fn fn, void *context)
+{
+ struct copy_info *info;
+ struct kcopyd_job *job;
+
+ /*
+ * Allocate a new copy_info.
+ */
+ info = alloc_copy_info();
+ if (!info)
+ return -ENOMEM;
+
+ job = kcopyd_alloc_job();
+ if (!job) {
+ free_copy_info(info);
+ return -ENOMEM;
+ }
+
+ /*
+ * set up for the read.
+ */
+ info->notify = fn;
+ info->notify_context = context;
+ memcpy(&info->to, to, sizeof(*to));
+
+ job->rw = READ;
+ memcpy(&job->disk, from, sizeof(*from));
+
+ job->offset = 0;
+ calc_block_sizes(job);
+ job->callback = copy_write;
+ job->context = info;
+
+ /*
+ * Trigger job.
+ */
+ kcopyd_io(job);
+ return 0;
+}
+
+/*-----------------------------------------------------------------
+ * Unit setup
+ *---------------------------------------------------------------*/
+static struct {
+ int (*init) (void);
+ void (*exit) (void);
+
+} _inits[] = {
+#define xx(n) { init_ ## n, exit_ ## n}
+ xx(pages),
+ xx(buffers),
+ xx(jobs),
+ xx(copier)
+#undef xx
+};
+
+static int _client_count = 0;
+static DECLARE_MUTEX(_client_count_sem);
+
+static int kcopyd_init(void)
+{
+ const int count = sizeof(_inits) / sizeof(*_inits);
+
+ int r, i;
+
+ for (i = 0; i < count; i++) {
+ r = _inits[i].init();
+ if (r)
+ goto bad;
+ }
+
+	r = start_daemon();
+	if (r)
+		goto bad;
+
+	return 0;
+
+ bad:
+ while (i--)
+ _inits[i].exit();
+
+ return r;
+}
+
+static void kcopyd_exit(void)
+{
+ int i = sizeof(_inits) / sizeof(*_inits);
+
+ if (stop_daemon())
+ DMWARN("Couldn't stop kcopyd.");
+
+ while (i--)
+ _inits[i].exit();
+}
+
+void kcopyd_inc_client_count(void)
+{
+ /*
+ * What I need here is an atomic_test_and_inc that returns
+ * the previous value of the atomic... In its absence I lock
+ * an int with a semaphore. :-(
+ */
+ down(&_client_count_sem);
+ if (_client_count == 0)
+ kcopyd_init();
+ _client_count++;
+
+ up(&_client_count_sem);
+}
+
+void kcopyd_dec_client_count(void)
+{
+ down(&_client_count_sem);
+ if (--_client_count == 0)
+ kcopyd_exit();
+
+ up(&_client_count_sem);
+}
--- diff/drivers/md/kcopyd.h 1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/kcopyd.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2001 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_KCOPYD_H
+#define DM_KCOPYD_H
+
+/*
+ * Needed for the definition of sector_t.
+ */
+#include <linux/device-mapper.h>
+#include <linux/iobuf.h>
+
+struct kcopyd_region {
+ kdev_t dev;
+ sector_t sector;
+ sector_t count;
+};
+
+#define MAX_KCOPYD_PAGES 128
+
+struct kcopyd_job {
+ struct list_head list;
+
+ /*
+ * Error state of the job.
+ */
+ int err;
+
+ /*
+ * Either READ or WRITE
+ */
+ int rw;
+
+ /*
+ * The source or destination for the transfer.
+ */
+ struct kcopyd_region disk;
+
+ int nr_pages;
+ struct page *pages[MAX_KCOPYD_PAGES];
+
+ /*
+ * Shifts and masks that will be useful when dispatching
+ * each buffer_head.
+ */
+ sector_t offset;
+ sector_t block_size;
+ sector_t block_shift;
+ sector_t bpp_shift; /* blocks per page */
+ sector_t bpp_mask;
+
+ /*
+	 * nr_blocks is how many buffer heads will have to be
+	 * dispatched to service this job, nr_requested is how
+	 * many have been dispatched and nr_incomplete is how many
+	 * are still outstanding.
+ */
+ unsigned int nr_blocks;
+ atomic_t nr_requested;
+ atomic_t nr_incomplete;
+
+ /*
+ * Set this to ensure you are notified when the job has
+ * completed. 'context' is for callback to use.
+ */
+ void (*callback)(struct kcopyd_job *job);
+ void *context;
+};
+
+/*
+ * Low level async io routines.
+ */
+struct kcopyd_job *kcopyd_alloc_job(void);
+void kcopyd_free_job(struct kcopyd_job *job);
+
+int kcopyd_io(struct kcopyd_job *job);
+
+/*
+ * Submit a copy job to kcopyd. This is built on top of the
+ * previous three fns.
+ */
+typedef void (*kcopyd_notify_fn)(int err, void *context);
+
+int kcopyd_copy(struct kcopyd_region *from, struct kcopyd_region *to,
+ kcopyd_notify_fn fn, void *context);
+
+int kcopyd_write_pages(struct kcopyd_region *to, int nr_pages,
+ struct page **pages, int offset, kcopyd_notify_fn fn,
+ void *context);
+
+/*
+ * We only want kcopyd to reserve resources if someone is
+ * actually using it.
+ */
+void kcopyd_inc_client_count(void);
+void kcopyd_dec_client_count(void);
+
+#endif
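
As a reference for how a client is expected to drive this interface, a minimal
sketch of a fire-and-forget background copy; copy_example and copy_done are
hypothetical, and a long-lived client would normally hold its client count for
its whole lifetime rather than per call:

	#include "dm.h"		/* DMWARN */
	#include "kcopyd.h"

	static void copy_done(int err, void *context)
	{
		/* runs in kcopyd's daemon context once the copy completes */
		if (err)
			DMWARN("example: background copy failed (%d)", err);
	}

	static int copy_example(kdev_t src, kdev_t dst, sector_t nr_sectors)
	{
		struct kcopyd_region from = { src, 0, nr_sectors };
		struct kcopyd_region to = { dst, 0, nr_sectors };

		/* make kcopyd reserve its pages, buffer heads and job pool */
		kcopyd_inc_client_count();

		return kcopyd_copy(&from, &to, copy_done, NULL);
		/* call kcopyd_dec_client_count() when finished with kcopyd */
	}
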
--- diff/include/linux/device-mapper.h 1970-01-01 01:00:00.000000000 +0100
+++ source/include/linux/device-mapper.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef _LINUX_DEVICE_MAPPER_H
+#define _LINUX_DEVICE_MAPPER_H
+
+#define DM_DIR "mapper" /* Slashes not supported */
+#define DM_MAX_TYPE_NAME 16
+#define DM_NAME_LEN 128
+#define DM_UUID_LEN 129
+
+#ifdef __KERNEL__
+
+typedef unsigned long sector_t;
+
+struct dm_target;
+struct dm_table;
+struct dm_dev;
+
+typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
+
+/*
+ * In the constructor the target parameter will already have the
+ * table, type, begin and len fields filled in.
+ */
+typedef int (*dm_ctr_fn) (struct dm_target *target, int argc, char **argv);
+
+/*
+ * The destructor doesn't need to free the dm_target, just
+ * anything hidden in ti->private.
+ */
+typedef void (*dm_dtr_fn) (struct dm_target *ti);
+
+/*
+ * The map function must return:
+ * < 0: error
+ * = 0: The target will handle the io by resubmitting it later
+ * > 0: simple remap complete
+ */
+typedef int (*dm_map_fn) (struct dm_target *ti, struct buffer_head *bh, int rw);
+typedef int (*dm_status_fn) (struct dm_target *ti, status_type_t status_type,
+ char *result, int maxlen);
+
+void dm_error(const char *message);
+
+/*
+ * Constructors should call these functions to ensure destination devices
+ * are opened/closed correctly.
+ * FIXME: too many arguments.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+ sector_t len, int mode, struct dm_dev **result);
+void dm_put_device(struct dm_target *ti, struct dm_dev *d);
+
+/*
+ * Information about a target type
+ */
+struct target_type {
+ const char *name;
+ struct module *module;
+ dm_ctr_fn ctr;
+ dm_dtr_fn dtr;
+ dm_map_fn map;
+ dm_status_fn status;
+};
+
+struct dm_target {
+ struct dm_table *table;
+ struct target_type *type;
+
+ /* target limits */
+ sector_t begin;
+ sector_t len;
+
+ /* target specific data */
+ void *private;
+
+ /* Used to provide an error string from the ctr */
+ char *error;
+};
+
+int dm_register_target(struct target_type *t);
+int dm_unregister_target(struct target_type *t);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_DEVICE_MAPPER_H */
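
To illustrate the target interface above, a condensed sketch of a linear-style
target; dm-linear.c in this patch is the real implementation, the "example"
names are hypothetical, and drivers/md/dm.h is assumed for dm_table_get_mode():

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include "dm.h"

	struct example_c {
		struct dm_dev *dev;
		sector_t start;
	};

	static int example_ctr(struct dm_target *ti, int argc, char **argv)
	{
		struct example_c *ec;

		if (argc != 2) {
			ti->error = "example: <dev path> <offset> required";
			return -EINVAL;
		}

		ec = kmalloc(sizeof(*ec), GFP_KERNEL);
		if (!ec) {
			ti->error = "example: cannot allocate context";
			return -ENOMEM;
		}

		ec->start = simple_strtoul(argv[1], NULL, 10);
		if (dm_get_device(ti, argv[0], ec->start, ti->len,
				  dm_table_get_mode(ti->table), &ec->dev)) {
			ti->error = "example: device lookup failed";
			kfree(ec);
			return -ENXIO;
		}

		ti->private = ec;
		return 0;
	}

	static void example_dtr(struct dm_target *ti)
	{
		struct example_c *ec = (struct example_c *) ti->private;

		dm_put_device(ti, ec->dev);
		kfree(ec);
	}

	static int example_map(struct dm_target *ti, struct buffer_head *bh,
			       int rw)
	{
		struct example_c *ec = (struct example_c *) ti->private;

		bh->b_rdev = ec->dev->dev;
		bh->b_rsector = ec->start + (bh->b_rsector - ti->begin);
		return 1;	/* > 0: simple remap complete */
	}

	static int example_status(struct dm_target *ti, status_type_t type,
				  char *result, int maxlen)
	{
		result[0] = '\0';
		return 0;
	}

	static struct target_type example_target = {
		.name	= "example",
		.module	= THIS_MODULE,
		.ctr	= example_ctr,
		.dtr	= example_dtr,
		.map	= example_map,
		.status	= example_status,
	};

	/* register with dm_register_target(&example_target) from module init */
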
--- diff/include/linux/dm-ioctl.h 1970-01-01 01:00:00.000000000 +0100
+++ source/include/linux/dm-ioctl.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef _LINUX_DM_IOCTL_H
+#define _LINUX_DM_IOCTL_H
+
+#include <linux/device-mapper.h>
+#include <linux/types.h>
+
+/*
+ * Implements a traditional ioctl interface to the device mapper.
+ */
+
+/*
+ * All ioctl arguments consist of a single chunk of memory, with
+ * this structure at the start. If a uuid is specified any
+ * lookup (eg. for a DM_INFO) will be done on that, *not* the
+ * name.
+ */
+struct dm_ioctl {
+ /*
+ * The version number is made up of three parts:
+ * major - no backward or forward compatibility,
+ * minor - only backwards compatible,
+ * patch - both backwards and forwards compatible.
+ *
+ * All clients of the ioctl interface should fill in the
+ * version number of the interface that they were
+ * compiled with.
+ *
+ * All recognised ioctl commands (ie. those that don't
+ * return -ENOTTY) fill out this field, even if the
+ * command failed.
+ */
+ uint32_t version[3]; /* in/out */
+ uint32_t data_size; /* total size of data passed in
+ * including this struct */
+
+ uint32_t data_start; /* offset to start of data
+ * relative to start of this struct */
+
+ uint32_t target_count; /* in/out */
+ uint32_t open_count; /* out */
+ uint32_t flags; /* in/out */
+
+ __kernel_dev_t dev; /* in/out */
+
+ char name[DM_NAME_LEN]; /* device name */
+ char uuid[DM_UUID_LEN]; /* unique identifier for
+ * the block device */
+};
+
+/*
+ * Used to specify tables. These structures appear after the
+ * dm_ioctl.
+ */
+struct dm_target_spec {
+ int32_t status; /* used when reading from kernel only */
+ uint64_t sector_start;
+ uint32_t length;
+
+ /*
+ * Offset in bytes (from the start of this struct) to
+ * next target_spec.
+ */
+ uint32_t next;
+
+ char target_type[DM_MAX_TYPE_NAME];
+
+ /*
+ * Parameter string starts immediately after this object.
+	 * Be careful to add padding after the string to ensure correct
+	 * alignment of the subsequent dm_target_spec.
+ */
+};
+
+/*
+ * Used to retrieve the target dependencies.
+ */
+struct dm_target_deps {
+ uint32_t count;
+
+ __kernel_dev_t dev[0]; /* out */
+};
+
+/*
+ * If you change this make sure you make the corresponding change
+ * to dm-ioctl.c:lookup_ioctl()
+ */
+enum {
+ /* Top level cmds */
+ DM_VERSION_CMD = 0,
+ DM_REMOVE_ALL_CMD,
+
+ /* device level cmds */
+ DM_DEV_CREATE_CMD,
+ DM_DEV_REMOVE_CMD,
+ DM_DEV_RELOAD_CMD,
+ DM_DEV_RENAME_CMD,
+ DM_DEV_SUSPEND_CMD,
+ DM_DEV_DEPS_CMD,
+ DM_DEV_STATUS_CMD,
+
+ /* target level cmds */
+ DM_TARGET_STATUS_CMD,
+ DM_TARGET_WAIT_CMD
+};
+
+#define DM_IOCTL 0xfd
+
+#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
+#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
+
+#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
+#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
+#define DM_DEV_RELOAD _IOWR(DM_IOCTL, DM_DEV_RELOAD_CMD, struct dm_ioctl)
+#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
+#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
+#define DM_DEV_DEPS _IOWR(DM_IOCTL, DM_DEV_DEPS_CMD, struct dm_ioctl)
+#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
+
+#define DM_TARGET_STATUS _IOWR(DM_IOCTL, DM_TARGET_STATUS_CMD, struct dm_ioctl)
+#define DM_TARGET_WAIT _IOWR(DM_IOCTL, DM_TARGET_WAIT_CMD, struct dm_ioctl)
+
+#define DM_VERSION_MAJOR 1
+#define DM_VERSION_MINOR 0
+#define DM_VERSION_PATCHLEVEL 6
+#define DM_VERSION_EXTRA "-ioctl (2002-10-15)"
+
+/* Status bits */
+#define DM_READONLY_FLAG 0x00000001
+#define DM_SUSPEND_FLAG 0x00000002
+#define DM_EXISTS_FLAG 0x00000004
+#define DM_PERSISTENT_DEV_FLAG 0x00000008
+
+/*
+ * Flag passed into ioctl STATUS command to get table information
+ * rather than current status.
+ */
+#define DM_STATUS_TABLE_FLAG 0x00000010
+
+#endif /* _LINUX_DM_IOCTL_H */
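
For reference, userspace drives this interface by passing a struct dm_ioctl
(plus any payload) through the control node registered by dm-ioctl.c; the
minimal version query below is illustrative only, and the /dev/mapper/control
path is an assumption:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/dm-ioctl.h>

	int main(void)
	{
		struct dm_ioctl io;
		int fd = open("/dev/mapper/control", O_RDWR);

		if (fd < 0)
			return 1;

		memset(&io, 0, sizeof(io));
		io.version[0] = DM_VERSION_MAJOR;
		io.version[1] = DM_VERSION_MINOR;
		io.version[2] = DM_VERSION_PATCHLEVEL;
		io.data_size = sizeof(io);	/* header only, no payload */

		if (ioctl(fd, DM_VERSION, &io) < 0) {
			close(fd);
			return 1;
		}

		printf("ioctl interface %u.%u.%u\n",
		       io.version[0], io.version[1], io.version[2]);
		close(fd);
		return 0;
	}
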
--- diff/include/linux/mempool.h 1970-01-01 01:00:00.000000000 +0100
+++ source/include/linux/mempool.h 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,41 @@
+/*
+ * memory buffer pool support
+ */
+#ifndef _LINUX_MEMPOOL_H
+#define _LINUX_MEMPOOL_H
+
+#include <linux/list.h>
+#include <linux/wait.h>
+
+struct mempool_s;
+typedef struct mempool_s mempool_t;
+
+typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
+typedef void (mempool_free_t)(void *element, void *pool_data);
+
+struct mempool_s {
+ spinlock_t lock;
+ int min_nr, curr_nr;
+ struct list_head elements;
+
+ void *pool_data;
+ mempool_alloc_t *alloc;
+ mempool_free_t *free;
+ wait_queue_head_t wait;
+};
+extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data);
+extern void mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
+extern void mempool_destroy(mempool_t *pool);
+extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
+extern void mempool_free(void *element, mempool_t *pool);
+
+
+/*
+ * A mempool_alloc_t and mempool_free_t that get the memory from
+ * a slab that is passed in through pool_data.
+ */
+void *mempool_alloc_slab(int gfp_mask, void *pool_data);
+void mempool_free_slab(void *element, void *pool_data);
+
+#endif /* _LINUX_MEMPOOL_H */
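
The slab-backed helpers above are what kcopyd.c uses, but the contract is
general: any allocator pair that honours pool_data will do. A small sketch with
a kmalloc-backed pool; the names and sizes are illustrative only, and each
element must still be large enough to hold a struct list_head:

	#include <linux/mempool.h>
	#include <linux/slab.h>

	/* pool_data carries the element size for this pool */
	static void *example_alloc(int gfp_mask, void *pool_data)
	{
		return kmalloc((size_t) pool_data, gfp_mask);
	}

	static void example_free(void *element, void *pool_data)
	{
		kfree(element);
	}

	static mempool_t *example_create(void)
	{
		/* 16 elements of 256 bytes are always held in reserve */
		return mempool_create(16, example_alloc, example_free,
				      (void *) 256);
	}
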
--- diff/mm/mempool.c 1970-01-01 01:00:00.000000000 +0100
+++ source/mm/mempool.c 2002-11-29 09:44:59.000000000 +0000
@@ -0,0 +1,295 @@
+/*
+ * linux/mm/mempool.c
+ *
+ * memory buffer pool support. Such pools are mostly used
+ * for guaranteed, deadlock-free memory allocations during
+ * extreme VM load.
+ *
+ * started by Ingo Molnar, Copyright (C) 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mempool.h>
+#include <linux/compiler.h>
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr: the minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @alloc_fn: user-defined element-allocation function.
+ * @free_fn: user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * this function creates and allocates a guaranteed size, preallocated
+ * memory pool. The pool can be used from the mempool_alloc and mempool_free
+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
+ * functions might sleep - as long as the mempool_alloc function is not called
+ * from IRQ contexts. The element allocated by alloc_fn() must be able to
+ * hold a struct list_head. (8 bytes on x86.)
+ */
+mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
+{
+ mempool_t *pool;
+ int i;
+
+ pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return NULL;
+ memset(pool, 0, sizeof(*pool));
+
+ spin_lock_init(&pool->lock);
+ pool->min_nr = min_nr;
+ pool->pool_data = pool_data;
+ INIT_LIST_HEAD(&pool->elements);
+ init_waitqueue_head(&pool->wait);
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+ */
+ for (i = 0; i < min_nr; i++) {
+ void *element;
+ struct list_head *tmp;
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
+
+ if (unlikely(!element)) {
+ /*
+ * Not enough memory - free the allocated ones
+ * and return:
+ */
+ list_for_each(tmp, &pool->elements) {
+ element = tmp;
+ pool->free(element, pool->pool_data);
+ }
+ kfree(pool);
+
+ return NULL;
+ }
+ tmp = element;
+ list_add(tmp, &pool->elements);
+ pool->curr_nr++;
+ }
+ return pool;
+}
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ * allocated for this pool.
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
+{
+ int delta;
+ void *element;
+ unsigned long flags;
+ struct list_head *tmp;
+
+ if (new_min_nr <= 0)
+ BUG();
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (new_min_nr < pool->min_nr) {
+ pool->min_nr = new_min_nr;
+ /*
+ * Free possible excess elements.
+ */
+ while (pool->curr_nr > pool->min_nr) {
+ tmp = pool->elements.next;
+ if (tmp == &pool->elements)
+ BUG();
+ list_del(tmp);
+ element = tmp;
+ pool->curr_nr--;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ pool->free(element, pool->pool_data);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return;
+ }
+ delta = new_min_nr - pool->min_nr;
+ pool->min_nr = new_min_nr;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /*
+	 * We refill the pool up to the new threshold - but we don't
+ * (cannot) guarantee that the refill succeeds.
+ */
+ while (delta) {
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (!element)
+ break;
+ mempool_free(element, pool);
+ delta--;
+ }
+}
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps. The caller
+ * has to guarantee that no mempool_alloc() nor mempool_free() happens in
+ * this pool when calling this function.
+ */
+void mempool_destroy(mempool_t *pool)
+{
+ void *element;
+ struct list_head *head, *tmp;
+
+ if (!pool)
+ return;
+
+ head = &pool->elements;
+ for (tmp = head->next; tmp != head; ) {
+ element = tmp;
+ tmp = tmp->next;
+ pool->free(element, pool->pool_data);
+ pool->curr_nr--;
+ }
+ if (pool->curr_nr)
+ BUG();
+ kfree(pool);
+}
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ * @gfp_mask: the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts. (it might
+ * fail if called from an IRQ context.)
+ */
+void * mempool_alloc(mempool_t *pool, int gfp_mask)
+{
+ void *element;
+ unsigned long flags;
+ struct list_head *tmp;
+ int curr_nr;
+ DECLARE_WAITQUEUE(wait, current);
+ int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+
+repeat_alloc:
+ element = pool->alloc(gfp_nowait, pool->pool_data);
+ if (likely(element != NULL))
+ return element;
+
+ /*
+ * If the pool is less than 50% full then try harder
+ * to allocate an element:
+ */
+ if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
+ element = pool->alloc(gfp_mask, pool->pool_data);
+ if (likely(element != NULL))
+ return element;
+ }
+
+ /*
+ * Kick the VM at this point.
+ */
+ wakeup_bdflush();
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (likely(pool->curr_nr)) {
+ tmp = pool->elements.next;
+ list_del(tmp);
+ element = tmp;
+ pool->curr_nr--;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return element;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /* We must not sleep in the GFP_ATOMIC case */
+ if (gfp_mask == gfp_nowait)
+ return NULL;
+
+ run_task_queue(&tq_disk);
+
+ add_wait_queue_exclusive(&pool->wait, &wait);
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ curr_nr = pool->curr_nr;
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (!curr_nr)
+ schedule();
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&pool->wait, &wait);
+
+ goto repeat_alloc;
+}
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element: pool element pointer.
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+ unsigned long flags;
+
+ if (pool->curr_nr < pool->min_nr) {
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->curr_nr < pool->min_nr) {
+ list_add(element, &pool->elements);
+ pool->curr_nr++;
+ spin_unlock_irqrestore(&pool->lock, flags);
+ wake_up(&pool->wait);
+ return;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+ pool->free(element, pool->pool_data);
+}
+
+/*
+ * A commonly used alloc and free fn.
+ */
+void *mempool_alloc_slab(int gfp_mask, void *pool_data)
+{
+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+ return kmem_cache_alloc(mem, gfp_mask);
+}
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+ kmem_cache_free(mem, element);
+}
+
+
+EXPORT_SYMBOL(mempool_create);
+EXPORT_SYMBOL(mempool_resize);
+EXPORT_SYMBOL(mempool_destroy);
+EXPORT_SYMBOL(mempool_alloc);
+EXPORT_SYMBOL(mempool_free);
+EXPORT_SYMBOL(mempool_alloc_slab);
+EXPORT_SYMBOL(mempool_free_slab);
+