qubes-linux-kernel/patches.suse/dm-raid45_2.6.27_20081027.patch
2010-07-07 13:12:45 +02:00

5607 lines
145 KiB
Diff

From: "Heinz Mauelshagen <hjm@redhat.de>
Subject: DMRAID45 module
X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/
Patch-mainline: not yet
DM-RAID 45 module.
This driver is used for "Fake RAID" devices.
Acked-by: Jeff Mahoney <jeffm@suse.com>
---
drivers/md/Kconfig | 15
drivers/md/Makefile | 4
drivers/md/dm-memcache.c | 301 ++
drivers/md/dm-memcache.h | 68
drivers/md/dm-message.c | 182 +
drivers/md/dm-message.h | 91
drivers/md/dm-raid45.c | 4523 +++++++++++++++++++++++++++++++++++++++++
drivers/md/dm-raid45.h | 28
drivers/md/dm-region-hash.c | 108
drivers/md/dm.c | 1
include/linux/dm-region-hash.h | 109
11 files changed, 5314 insertions(+), 116 deletions(-)
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -120,7 +120,6 @@ config MD_RAID10
config MD_RAID456
tristate "RAID-4/RAID-5/RAID-6 mode"
- depends on BLK_DEV_MD
select MD_RAID6_PQ
select ASYNC_MEMCPY
select ASYNC_XOR
@@ -249,9 +248,14 @@ config DM_SNAPSHOT
---help---
Allow volume managers to take writable snapshots of a device.
+config DM_RAID
+ tristate
+ depends on BLK_DEV_DM
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
+ select DM_RAID
---help---
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
@@ -313,6 +317,15 @@ config DM_DELAY
If unsure, say N.
+config DM_RAID45
+ tristate "RAID 4/5 target (EXPERIMENTAL)"
+ depends on DM_RAID
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ A target that supports RAID4 and RAID5 mappings.
+
+ If unsure, say N.
+
config DM_UEVENT
bool "DM uevents (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -41,7 +41,9 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipa
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
-obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_RAID) += dm-region-hash.o dm-log.o
+obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
+obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-memcache.o dm-message.o
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
--- /dev/null
+++ b/drivers/md/dm-memcache.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * Device-mapper memory object handling:
+ *
+ * o allocate/free total_pages in a per client page pool.
+ *
+ * o allocate/free memory objects with chunks (1..n) of
+ * pages_per_chunk pages hanging off.
+ *
+ * This file is released under the GPL.
+ */
+
+#define DM_MEM_CACHE_VERSION "0.2"
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+
+struct dm_mem_cache_client {
+ spinlock_t lock;
+ mempool_t *objs_pool;
+ struct page_list *free_list;
+ unsigned objects;
+ unsigned chunks;
+ unsigned pages_per_chunk;
+ unsigned free_pages;
+ unsigned total_pages;
+};
+
+/*
+ * Free pages and page_list elements of client.
+ */
+static void free_cache_pages(struct page_list *list)
+{
+ while (list) {
+ struct page_list *pl = list;
+
+ list = pl->next;
+ BUG_ON(!pl->page);
+ __free_page(pl->page);
+ kfree(pl);
+ }
+}
+
+/*
+ * Alloc number of pages and page_list elements as required by client.
+ */
+static struct page_list *alloc_cache_pages(unsigned pages)
+{
+ struct page_list *pl, *ret = NULL;
+ struct page *page;
+
+ while (pages--) {
+ page = alloc_page(GFP_NOIO);
+ if (!page)
+ goto err;
+
+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ if (!pl) {
+ __free_page(page);
+ goto err;
+ }
+
+ pl->page = page;
+ pl->next = ret;
+ ret = pl;
+ }
+
+ return ret;
+
+err:
+ free_cache_pages(ret);
+ return NULL;
+}
+
+/*
+ * Allocate page_list elements from the pool to chunks of the memory object.
+ */
+static void alloc_chunks(struct dm_mem_cache_client *cl,
+ struct dm_mem_cache_object *obj)
+{
+ unsigned chunks = cl->chunks;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ local_irq_disable();
+ while (chunks--) {
+ unsigned p = cl->pages_per_chunk;
+
+ obj[chunks].pl = NULL;
+
+ while (p--) {
+ struct page_list *pl;
+
+ /* Take next element from free list */
+ spin_lock(&cl->lock);
+ pl = cl->free_list;
+ BUG_ON(!pl);
+ cl->free_list = pl->next;
+ spin_unlock(&cl->lock);
+
+ pl->next = obj[chunks].pl;
+ obj[chunks].pl = pl;
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Free page_list elements putting them back onto free list
+ */
+static void free_chunks(struct dm_mem_cache_client *cl,
+ struct dm_mem_cache_object *obj)
+{
+ unsigned chunks = cl->chunks;
+ unsigned long flags;
+ struct page_list *next, *pl;
+
+ local_irq_save(flags);
+ local_irq_disable();
+ while (chunks--) {
+ for (pl = obj[chunks].pl; pl; pl = next) {
+ next = pl->next;
+
+ spin_lock(&cl->lock);
+ pl->next = cl->free_list;
+ cl->free_list = pl;
+ cl->free_pages++;
+ spin_unlock(&cl->lock);
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Create/destroy dm memory cache client resources.
+ */
+struct dm_mem_cache_client *
+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
+ unsigned pages_per_chunk)
+{
+ unsigned total_pages = objects * chunks * pages_per_chunk;
+ struct dm_mem_cache_client *client;
+
+ BUG_ON(!total_pages);
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return ERR_PTR(-ENOMEM);
+
+ client->objs_pool = mempool_create_kmalloc_pool(objects,
+ chunks * sizeof(struct dm_mem_cache_object));
+ if (!client->objs_pool)
+ goto err;
+
+ client->free_list = alloc_cache_pages(total_pages);
+ if (!client->free_list)
+ goto err1;
+
+ spin_lock_init(&client->lock);
+ client->objects = objects;
+ client->chunks = chunks;
+ client->pages_per_chunk = pages_per_chunk;
+ client->free_pages = client->total_pages = total_pages;
+ return client;
+
+err1:
+ mempool_destroy(client->objs_pool);
+err:
+ kfree(client);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_create);
+
+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
+{
+ BUG_ON(cl->free_pages != cl->total_pages);
+ free_cache_pages(cl->free_list);
+ mempool_destroy(cl->objs_pool);
+ kfree(cl);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_destroy);
+
+/*
+ * Grow a clients cache by an amount of pages.
+ *
+ * Don't call from interrupt context!
+ */
+int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
+{
+ unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
+ struct page_list *pl, *last;
+
+ BUG_ON(!pages);
+ pl = alloc_cache_pages(pages);
+ if (!pl)
+ return -ENOMEM;
+
+ last = pl;
+ while (last->next)
+ last = last->next;
+
+ spin_lock_irq(&cl->lock);
+ last->next = cl->free_list;
+ cl->free_list = pl;
+ cl->free_pages += pages;
+ cl->total_pages += pages;
+ cl->objects++;
+ spin_unlock_irq(&cl->lock);
+
+ mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+ return 0;
+}
+EXPORT_SYMBOL(dm_mem_cache_grow);
+
+/* Shrink a clients cache by an amount of pages */
+int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
+{
+ int r;
+ unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
+ unsigned long flags;
+ struct page_list *last = NULL, *pl, *pos;
+
+ BUG_ON(!pages);
+
+ spin_lock_irqsave(&cl->lock, flags);
+ pl = pos = cl->free_list;
+ while (p-- && pos->next) {
+ last = pos;
+ pos = pos->next;
+ }
+
+ if (++p)
+ r = -ENOMEM;
+ else {
+ r = 0;
+ cl->free_list = pos;
+ cl->free_pages -= pages;
+ cl->total_pages -= pages;
+ cl->objects--;
+ last->next = NULL;
+ }
+ spin_unlock_irqrestore(&cl->lock, flags);
+
+ if (!r) {
+ free_cache_pages(pl);
+ mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+ }
+
+ return r;
+}
+EXPORT_SYMBOL(dm_mem_cache_shrink);
+
+/*
+ * Allocate/free a memory object
+ *
+ * Can be called from interrupt context
+ */
+struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
+{
+ int r = 0;
+ unsigned pages = cl->chunks * cl->pages_per_chunk;
+ unsigned long flags;
+ struct dm_mem_cache_object *obj;
+
+ obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
+ if (!obj)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_irqsave(&cl->lock, flags);
+ if (pages > cl->free_pages)
+ r = -ENOMEM;
+ else
+ cl->free_pages -= pages;
+ spin_unlock_irqrestore(&cl->lock, flags);
+
+ if (r) {
+ mempool_free(obj, cl->objs_pool);
+ return ERR_PTR(r);
+ }
+
+ alloc_chunks(cl, obj);
+ return obj;
+}
+EXPORT_SYMBOL(dm_mem_cache_alloc);
+
+void dm_mem_cache_free(struct dm_mem_cache_client *cl,
+ struct dm_mem_cache_object *obj)
+{
+ free_chunks(cl, obj);
+ mempool_free(obj, cl->objs_pool);
+}
+EXPORT_SYMBOL(dm_mem_cache_free);
+
+MODULE_DESCRIPTION(DM_NAME " dm memory cache");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
+MODULE_LICENSE("GPL");
--- /dev/null
+++ b/drivers/md/dm-memcache.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
+ *
+ * Device-mapper memory object handling:
+ *
+ * o allocate/free total_pages in a per client page pool.
+ *
+ * o allocate/free memory objects with chunks (1..n) of
+ * pages_per_chunk pages hanging off.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _DM_MEM_CACHE_H
+#define _DM_MEM_CACHE_H
+
+#define DM_MEM_CACHE_H_VERSION "0.1"
+
+#include "dm.h"
+#include <linux/dm-io.h>
+
+static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
+{
+ while (pl && p--)
+ pl = pl->next;
+
+ return pl;
+}
+
+struct dm_mem_cache_object {
+ struct page_list *pl; /* Dynamically allocated array */
+ void *private; /* Caller context reference */
+};
+
+struct dm_mem_cache_client;
+
+/*
+ * Create/destroy dm memory cache client resources.
+ *
+ * On creation, a number of @objects with @chunks of
+ * @pages_per_chunk pages will be allocated.
+ */
+struct dm_mem_cache_client *
+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
+ unsigned pages_per_chunk);
+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
+
+/*
+ * Grow/shrink a dm memory cache client resources
+ * by @objetcs amount of objects.
+ */
+int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
+int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
+
+/*
+ * Allocate/free a memory object
+ *
+ * On allocation one object with an amount of chunks and
+ * an amount of pages per chunk will be returned on success.
+ */
+struct dm_mem_cache_object *
+dm_mem_cache_alloc(struct dm_mem_cache_client *client);
+void dm_mem_cache_free(struct dm_mem_cache_client *client,
+ struct dm_mem_cache_object *object);
+
+#endif
--- /dev/null
+++ b/drivers/md/dm-message.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * General device-mapper message interface argument parser.
+ *
+ * This file is released under the GPL.
+ *
+ * device-mapper message parser.
+ *
+ */
+
+#include "dm.h"
+#include "dm-message.h"
+#include <linux/kernel.h>
+
+#define DM_MSG_PREFIX "dm_message"
+
+/* Basename of a path. */
+static inline char *
+basename(char *s)
+{
+ char *p = strrchr(s, '/');
+
+ return p ? p + 1 : s;
+}
+
+/* Get an argument depending on type. */
+static void
+message_arguments(struct dm_msg *msg, int argc, char **argv)
+{
+
+ if (argc) {
+ int i;
+ struct dm_message_argument *args = msg->spec->args;
+
+ for (i = 0; i < args->num_args; i++) {
+ int r;
+ unsigned long **ptr = args->ptr;
+ enum dm_message_argument_type type = args->types[i];
+
+ switch (type) {
+ case dm_msg_base_t:
+ ((char **) ptr)[i] = basename(argv[i]);
+ break;
+
+ case dm_msg_str_t:
+ ((char **) ptr)[i] = argv[i];
+ break;
+
+ case dm_msg_int_t:
+ r = sscanf(argv[i], "%d", ((int **) ptr)[i]);
+ goto check;
+
+ case dm_msg_uint_t:
+ r = sscanf(argv[i], "%u",
+ ((unsigned **) ptr)[i]);
+ goto check;
+
+ case dm_msg_uint64_t:
+ r = sscanf(argv[i], "%llu",
+ ((unsigned long long **) ptr)[i]);
+
+check:
+ if (r != 1) {
+ set_bit(dm_msg_ret_undef, &msg->ret);
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ }
+ }
+ }
+ }
+}
+
+/* Parse message options. */
+static void
+message_options_parse(struct dm_msg *msg, int argc, char **argv)
+{
+ int hit = 0;
+ unsigned long *action;
+ size_t l1 = strlen(*argv), l_hit = 0;
+ struct dm_message_option *o = msg->spec->options;
+ char **option, **option_end = o->options + o->num_options;
+
+ for (option = o->options, action = o->actions;
+ option < option_end; option++, action++) {
+ size_t l2 = strlen(*option);
+
+ if (!strnicmp(*argv, *option, min(l1, l2))) {
+ hit++;
+ l_hit = l2;
+ set_bit(*action, &msg->action);
+ }
+ }
+
+ /* Assume error. */
+ msg->ret = 0;
+ set_bit(dm_msg_ret_option, &msg->ret);
+ if (!hit || l1 > l_hit)
+ set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */
+ else if (hit > 1)
+ set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/
+ else {
+ clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */
+ message_arguments(msg, --argc, ++argv);
+ }
+}
+
+static inline void
+print_ret(const char *caller, unsigned long ret)
+{
+ struct {
+ unsigned long err;
+ const char *err_str;
+ } static err_msg[] = {
+ { dm_msg_ret_ambiguous, "message ambiguous" },
+ { dm_msg_ret_inval, "message invalid" },
+ { dm_msg_ret_undef, "message undefined" },
+ { dm_msg_ret_arg, "message argument" },
+ { dm_msg_ret_argcount, "message argument count" },
+ { dm_msg_ret_option, "option" },
+ }, *e = ARRAY_END(err_msg);
+
+ while (e-- > err_msg) {
+ if (test_bit(e->err, &ret))
+ DMERR("%s %s", caller, e->err_str);
+ }
+}
+
+/* Parse a message action. */
+int
+dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
+ int argc, char **argv)
+{
+ int hit = 0;
+ size_t l1 = strlen(*argv), l_hit = 0;
+ struct dm_msg_spec *s, *s_hit = NULL,
+ *s_end = msg->specs + msg->num_specs;
+
+ if (argc < 2)
+ return -EINVAL;
+
+ for (s = msg->specs; s < s_end; s++) {
+ size_t l2 = strlen(s->cmd);
+
+ if (!strnicmp(*argv, s->cmd, min(l1, l2))) {
+ hit++;
+ l_hit = l2;
+ s_hit = s;
+ }
+ }
+
+ msg->ret = 0;
+ if (!hit || l1 > l_hit) /* No hit or message string too long. */
+ set_bit(dm_msg_ret_undef, &msg->ret);
+ else if (hit > 1) /* Ambiguous message. */
+ set_bit(dm_msg_ret_ambiguous, &msg->ret);
+ else if (argc - 2 != s_hit->args->num_args) {
+ set_bit(dm_msg_ret_undef, &msg->ret);
+ set_bit(dm_msg_ret_argcount, &msg->ret);
+ }
+
+ if (msg->ret)
+ goto bad;
+
+ msg->action = 0;
+ msg->spec = s_hit;
+ set_bit(s_hit->action, &msg->action);
+ message_options_parse(msg, --argc, ++argv);
+
+ if (!msg->ret)
+ return msg->spec->f(msg, context);
+
+bad:
+ print_ret(caller, msg->ret);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(dm_message_parse);
+
+MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
+MODULE_LICENSE("GPL");
--- /dev/null
+++ b/drivers/md/dm-message.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de>
+ *
+ * General device-mapper message interface argument parser.
+ *
+ * This file is released under the GPL.
+ *
+ */
+
+#ifndef DM_MESSAGE_H
+#define DM_MESSAGE_H
+
+/* Factor out to dm.h. */
+/* Reference to array end. */
+#define ARRAY_END(a) ((a) + ARRAY_SIZE(a))
+
+/* Message return bits. */
+enum dm_message_return {
+ dm_msg_ret_ambiguous, /* Action ambiguous. */
+ dm_msg_ret_inval, /* Action invalid. */
+ dm_msg_ret_undef, /* Action undefined. */
+
+ dm_msg_ret_option, /* Option error. */
+ dm_msg_ret_arg, /* Argument error. */
+ dm_msg_ret_argcount, /* Argument count error. */
+};
+
+/* Message argument type conversions. */
+enum dm_message_argument_type {
+ dm_msg_base_t, /* Basename string. */
+ dm_msg_str_t, /* String. */
+ dm_msg_int_t, /* Signed int. */
+ dm_msg_uint_t, /* Unsigned int. */
+ dm_msg_uint64_t, /* Unsigned int 64. */
+};
+
+/* A message option. */
+struct dm_message_option {
+ unsigned num_options;
+ char **options;
+ unsigned long *actions;
+};
+
+/* Message arguments and types. */
+struct dm_message_argument {
+ unsigned num_args;
+ unsigned long **ptr;
+ enum dm_message_argument_type types[];
+};
+
+/* Client message. */
+struct dm_msg {
+ unsigned long action; /* Identified action. */
+ unsigned long ret; /* Return bits. */
+ unsigned num_specs; /* # of sepcifications listed. */
+ struct dm_msg_spec *specs; /* Specification list. */
+ struct dm_msg_spec *spec; /* Specification selected. */
+};
+
+/* Secification of the message. */
+struct dm_msg_spec {
+ const char *cmd; /* Name of the command (i.e. 'bandwidth'). */
+ unsigned long action;
+ struct dm_message_option *options;
+ struct dm_message_argument *args;
+ unsigned long parm; /* Parameter to pass through to callback. */
+ /* Function to process for action. */
+ int (*f) (struct dm_msg *msg, void *context);
+};
+
+/* Parameter access macros. */
+#define DM_MSG_PARM(msg) ((msg)->spec->parm)
+
+#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx])
+#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx])
+#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARG(msg, idx))
+#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx])
+
+#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0)
+#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0)
+#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0)
+#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0)
+
+
+/* Parse a message and its options and optionally call a function back. */
+int dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
+ int argc, char **argv);
+
+#endif
--- /dev/null
+++ b/drivers/md/dm-raid45.c
@@ -0,0 +1,4524 @@
+/*
+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ *
+ * Supports:
+ * o RAID4 with dedicated and selectable parity device
+ * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ * o run time optimization of xor algorithm used to calculate parity
+ *
+ *
+ * Thanks to MD for:
+ * o the raid address calculation algorithm
+ * o the base of the biovec <-> page list copier.
+ *
+ *
+ * Uses region hash to keep track of how many writes are in flight to
+ * regions in order to use dirty log to keep state of regions to recover:
+ *
+ * o clean regions (those which are synchronized
+ * and don't have write io in flight)
+ * o dirty regions (those with write io in flight)
+ *
+ *
+ * On startup, any dirty regions are migrated to the 'nosync' state
+ * and are subject to recovery by the daemon.
+ *
+ * See raid_ctr() for table definition.
+ *
+ *
+ * FIXME:
+ * o add virtual interface for locking
+ * o remove instrumentation (REMOVEME:)
+ *
+ */
+
+static const char *version = "v0.2431";
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include "dm-message.h"
+#include "dm-raid45.h"
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+/* # of parallel recovered regions */
+/* FIXME: cope with multiple recovery stripes in raid_set struct. */
+#define MAX_RECOVER 1 /* needs to be 1! */
+
+/*
+ * Configurable parameters
+ */
+#define INLINE
+
+/* Default # of stripes if not set in constructor. */
+#define STRIPES 64
+
+/* Minimum/maximum # of selectable stripes. */
+#define STRIPES_MIN 8
+#define STRIPES_MAX 16384
+
+/* Default chunk size in sectors if not set in constructor. */
+#define CHUNK_SIZE 64
+
+/* Default io size in sectors if not set in constructor. */
+#define IO_SIZE_MIN SECTORS_PER_PAGE
+#define IO_SIZE IO_SIZE_MIN
+
+/* Maximum setable chunk size in sectors. */
+#define CHUNK_SIZE_MAX 16384
+
+/* Recover io size default in sectors. */
+#define RECOVER_IO_SIZE_MIN 64
+#define RECOVER_IO_SIZE 256
+
+/* Default percentage recover io bandwidth. */
+#define BANDWIDTH 10
+#define BANDWIDTH_MIN 1
+#define BANDWIDTH_MAX 100
+/*
+ * END Configurable parameters
+ */
+
+#define TARGET "dm-raid45"
+#define DAEMON "kraid45d"
+#define DM_MSG_PREFIX TARGET
+
+#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
+
+/* Amount/size for __xor(). */
+#define SECTORS_PER_XOR SECTORS_PER_PAGE
+#define XOR_SIZE PAGE_SIZE
+
+/* Derive raid_set from stripe_cache pointer. */
+#define RS(x) container_of(x, struct raid_set, sc)
+
+/* Check value in range. */
+#define range_ok(i, min, max) (i >= min && i <= max)
+
+/* Page reference. */
+#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
+
+/* Bio list reference. */
+#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
+
+/* Page list reference. */
+#define PL(stripe, p) (stripe->obj[p].pl)
+
+/* Check argument is power of 2. */
+#define POWER_OF_2(a) (!(a & (a - 1)))
+
+/* Factor out to dm-bio-list.h */
+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+{
+ bio->bi_next = bl->head;
+ bl->head = bio;
+
+ if (!bl->tail)
+ bl->tail = bio;
+}
+
+/* Factor out to dm.h */
+#define TI_ERR_RET(str, ret) \
+ do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
+#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
+
+/*-----------------------------------------------------------------
+ * Stripe cache
+ *
+ * Cache for all reads and writes to raid sets (operational or degraded)
+ *
+ * We need to run all data to and from a RAID set through this cache,
+ * because parity chunks need to get calculated from data chunks
+ * or, in the degraded/resynchronization case, missing chunks need
+ * to be reconstructed using the other chunks of the stripe.
+ *---------------------------------------------------------------*/
+/* Protect kmem cache # counter. */
+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+
+/* A stripe set (holds bios hanging off). */
+struct stripe_set {
+ struct stripe *stripe; /* Backpointer to stripe for endio(). */
+ struct bio_list bl[3]; /* Reads, writes, and writes merged. */
+#define WRITE_MERGED 2
+};
+
+#if READ != 0 || WRITE != 1
+#error dm-raid45: READ/WRITE != 0/1 used as index!!!
+#endif
+
+/*
+ * Stripe linked list indexes. Keep order, because the stripe
+ * and the stripe cache rely on the first 3!
+ */
+enum list_types {
+ LIST_IO = 0, /* Stripes with io pending. */
+ LIST_ENDIO, /* Stripes to endio. */
+ LIST_LRU, /* Least recently used stripes. */
+ LIST_HASH, /* Hashed stripes. */
+ LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+ NR_LISTS, /* To size array in struct stripe. */
+};
+
+enum lock_types {
+ LOCK_ENDIO = 0, /* Protect endio list. */
+ LOCK_LRU, /* Protect lru list. */
+ NR_LOCKS, /* To size array in struct stripe_cache. */
+};
+
+/* A stripe: the io object to handle all reads and writes to a RAID set. */
+struct stripe {
+ struct stripe_cache *sc; /* Backpointer to stripe cache. */
+
+ sector_t key; /* Hash key. */
+ region_t region; /* Region stripe is mapped to. */
+
+ /* Reference count. */
+ atomic_t cnt;
+
+ struct {
+ unsigned long flags; /* flags (see below). */
+
+ /*
+ * Pending ios in flight:
+ *
+ * used as a 'lock' to control move of stripe to endio list
+ */
+ atomic_t pending; /* Pending ios in flight. */
+
+ /* Sectors to read and write for multi page stripe sets. */
+ unsigned size;
+ } io;
+
+ /* Lock on stripe (for clustering). */
+ void *lock;
+
+ /*
+ * 4 linked lists:
+ * o io list to flush io
+ * o endio list
+ * o LRU list to put stripes w/o reference count on
+ * o stripe cache hash
+ */
+ struct list_head lists[NR_LISTS];
+
+ struct {
+ unsigned short parity; /* Parity chunk index. */
+ short recover; /* Recovery chunk index. */
+ } idx;
+
+ /* This sets memory cache object (dm-mem-cache). */
+ struct dm_mem_cache_object *obj;
+
+ /* Array of stripe sets (dynamically allocated). */
+ struct stripe_set ss[0];
+};
+
+/* States stripes can be in (flags field). */
+enum stripe_states {
+ STRIPE_ACTIVE, /* Active io on stripe. */
+ STRIPE_ERROR, /* io error on stripe. */
+ STRIPE_MERGED, /* Writes got merged. */
+ STRIPE_READ, /* Read. */
+ STRIPE_RBW, /* Read-before-write. */
+ STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
+ STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
+};
+
+/* ... and macros to access them. */
+#define BITOPS(name, what, var, flag) \
+static inline int TestClear ## name ## what(struct var *v) \
+{ return test_and_clear_bit(flag, &v->io.flags); } \
+static inline int TestSet ## name ## what(struct var *v) \
+{ return test_and_set_bit(flag, &v->io.flags); } \
+static inline void Clear ## name ## what(struct var *v) \
+{ clear_bit(flag, &v->io.flags); } \
+static inline void Set ## name ## what(struct var *v) \
+{ set_bit(flag, &v->io.flags); } \
+static inline int name ## what(struct var *v) \
+{ return test_bit(flag, &v->io.flags); }
+
+
+BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
+BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
+BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
+BITOPS(Stripe, Read, stripe, STRIPE_READ)
+BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
+BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
+BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
+
+/* A stripe hash. */
+struct stripe_hash {
+ struct list_head *hash;
+ unsigned buckets;
+ unsigned mask;
+ unsigned prime;
+ unsigned shift;
+};
+
+/* A stripe cache. */
+struct stripe_cache {
+ /* Stripe hash. */
+ struct stripe_hash hash;
+
+ /* Stripes with io to flush, stripes to endio and LRU lists. */
+ struct list_head lists[3];
+
+ /* Locks to protect endio and lru lists. */
+ spinlock_t locks[NR_LOCKS];
+
+ /* Slab cache to allocate stripes from. */
+ struct {
+ struct kmem_cache *cache; /* Cache itself. */
+ char name[32]; /* Unique name. */
+ } kc;
+
+ struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+
+ /* dm-mem-cache client resource context. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ int stripes_parm; /* # stripes parameter from constructor. */
+ atomic_t stripes; /* actual # of stripes in cache. */
+ atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
+ atomic_t stripes_last; /* last # of stripes in cache. */
+ atomic_t active_stripes; /* actual # of active stripes in cache. */
+
+ /* REMOVEME: */
+ atomic_t max_active_stripes; /* actual # of active stripes in cache. */
+};
+
+/* Flag specs for raid_dev */ ;
+enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
+
+/* The raid device in a set. */
+struct raid_dev {
+ struct dm_dev *dev;
+ unsigned long flags; /* raid_dev_flags. */
+ sector_t start; /* offset to map to. */
+};
+
+/* Flags spec for raid_set. */
+enum raid_set_flags {
+ RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
+ RS_DEAD, /* RAID set inoperational. */
+ RS_DEVEL_STATS, /* REMOVEME: display status information. */
+ RS_IO_ERROR, /* io error on set. */
+ RS_RECOVER, /* Do recovery. */
+ RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
+ RS_REGION_GET, /* get a region to recover. */
+ RS_SC_BUSY, /* stripe cache busy -> send an event. */
+ RS_SUSPENDED, /* RAID set suspendedn. */
+};
+
+/* REMOVEME: devel stats counters. */
+enum stats_types {
+ S_BIOS_READ,
+ S_BIOS_ADDED_READ,
+ S_BIOS_ENDIO_READ,
+ S_BIOS_WRITE,
+ S_BIOS_ADDED_WRITE,
+ S_BIOS_ENDIO_WRITE,
+ S_CAN_MERGE,
+ S_CANT_MERGE,
+ S_CONGESTED,
+ S_DM_IO_READ,
+ S_DM_IO_WRITE,
+ S_ACTIVE_READS,
+ S_BANDWIDTH,
+ S_BARRIER,
+ S_BIO_COPY_PL_NEXT,
+ S_DEGRADED,
+ S_DELAYED_BIOS,
+ S_EVICT,
+ S_FLUSHS,
+ S_HITS_1ST,
+ S_IOS_POST,
+ S_INSCACHE,
+ S_MAX_LOOKUP,
+ S_MERGE_PAGE_LOCKED,
+ S_NO_BANDWIDTH,
+ S_NOT_CONGESTED,
+ S_NO_RW,
+ S_NOSYNC,
+ S_PROHIBITPAGEIO,
+ S_RECONSTRUCT_EI,
+ S_RECONSTRUCT_DEV,
+ S_REDO,
+ S_REQUEUE,
+ S_STRIPE_ERROR,
+ S_SUM_DELAYED_BIOS,
+ S_XORS,
+ S_NR_STATS, /* # of stats counters. */
+};
+
+/* Status type -> string mappings. */
+struct stats_map {
+ const enum stats_types type;
+ const char *str;
+};
+
+static struct stats_map stats_map[] = {
+ { S_BIOS_READ, "r=" },
+ { S_BIOS_ADDED_READ, "/" },
+ { S_BIOS_ENDIO_READ, "/" },
+ { S_BIOS_WRITE, " w=" },
+ { S_BIOS_ADDED_WRITE, "/" },
+ { S_BIOS_ENDIO_WRITE, "/" },
+ { S_DM_IO_READ, " rc=" },
+ { S_DM_IO_WRITE, " wc=" },
+ { S_ACTIVE_READS, " active_reads=" },
+ { S_BANDWIDTH, " bandwidth=" },
+ { S_NO_BANDWIDTH, " no_bandwidth=" },
+ { S_BARRIER, " barrier=" },
+ { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
+ { S_CAN_MERGE, " can_merge=" },
+ { S_MERGE_PAGE_LOCKED, "/page_locked=" },
+ { S_CANT_MERGE, "/cant_merge=" },
+ { S_CONGESTED, " congested=" },
+ { S_NOT_CONGESTED, "/not_congested=" },
+ { S_DEGRADED, " degraded=" },
+ { S_DELAYED_BIOS, " delayed_bios=" },
+ { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
+ { S_EVICT, " evict=" },
+ { S_FLUSHS, " flushs=" },
+ { S_HITS_1ST, " hits_1st=" },
+ { S_IOS_POST, " ios_post=" },
+ { S_INSCACHE, " inscache=" },
+ { S_MAX_LOOKUP, " max_lookup=" },
+ { S_NO_RW, " no_rw=" },
+ { S_NOSYNC, " nosync=" },
+ { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
+ { S_RECONSTRUCT_EI, " reconstruct_ei=" },
+ { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
+ { S_REDO, " redo=" },
+ { S_REQUEUE, " requeue=" },
+ { S_STRIPE_ERROR, " stripe_error=" },
+ { S_XORS, " xors=" },
+};
+
+/*
+ * A RAID set.
+ */
+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+struct raid_set {
+ struct dm_target *ti; /* Target pointer. */
+
+ struct {
+ unsigned long flags; /* State flags. */
+ spinlock_t in_lock; /* Protects central input list below. */
+ struct bio_list in; /* Pending ios (central input list). */
+ struct bio_list work; /* ios work set. */
+ wait_queue_head_t suspendq; /* suspend synchronization. */
+ atomic_t in_process; /* counter of queued bios (suspendq). */
+ atomic_t in_process_max;/* counter of queued bios max. */
+
+ /* io work. */
+ struct workqueue_struct *wq;
+ struct delayed_work dws;
+ } io;
+
+ /* External locking. */
+ struct dm_raid45_locking_type *locking;
+
+ struct stripe_cache sc; /* Stripe cache for this set. */
+
+ /* Xor optimization. */
+ struct {
+ struct xor_func *f;
+ unsigned chunks;
+ unsigned speed;
+ } xor;
+
+ /* Recovery parameters. */
+ struct recover {
+ struct dm_dirty_log *dl; /* Dirty log. */
+ struct dm_region_hash *rh; /* Region hash. */
+
+ /* dm-mem-cache client resource context for recovery stripes. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ struct list_head stripes; /* List of recovery stripes. */
+
+ region_t nr_regions;
+ region_t nr_regions_to_recover;
+ region_t nr_regions_recovered;
+ unsigned long start_jiffies;
+ unsigned long end_jiffies;
+
+ unsigned bandwidth; /* Recovery bandwidth [%]. */
+ unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+ unsigned bandwidth_parm; /* " constructor parm. */
+ unsigned io_size; /* io size <= chunk size. */
+ unsigned io_size_parm; /* io size ctr parameter. */
+
+ /* recovery io throttling. */
+ atomic_t io_count[2]; /* counter recover/regular io. */
+ unsigned long last_jiffies;
+
+ struct dm_region *reg; /* Actual region to recover. */
+ sector_t pos; /* Position within region to recover. */
+ sector_t end; /* End of region to recover. */
+ } recover;
+
+ /* RAID set parameters. */
+ struct {
+ struct raid_type *raid_type; /* RAID type (eg, RAID4). */
+ unsigned raid_parms; /* # variable raid parameters. */
+
+ unsigned chunk_size; /* Sectors per chunk. */
+ unsigned chunk_size_parm;
+ unsigned chunk_mask; /* Mask for amount. */
+ unsigned chunk_shift; /* rsector chunk size shift. */
+
+ unsigned io_size; /* Sectors per io. */
+ unsigned io_size_parm;
+ unsigned io_mask; /* Mask for amount. */
+ unsigned io_shift_mask; /* Mask for raid_address(). */
+ unsigned io_shift; /* rsector io size shift. */
+ unsigned pages_per_io; /* Pages per io. */
+
+ sector_t sectors_per_dev; /* Sectors per device. */
+
+ atomic_t failed_devs; /* Amount of devices failed. */
+
+ /* Index of device to initialize. */
+ int dev_to_init;
+ int dev_to_init_parm;
+
+ /* Raid devices dynamically allocated. */
+ unsigned raid_devs; /* # of RAID devices below. */
+ unsigned data_devs; /* # of RAID data devices. */
+
+ int ei; /* index of failed RAID device. */
+
+ /* index of dedicated parity device (i.e. RAID4). */
+ int pi;
+ int pi_parm; /* constructor parm for status output. */
+ } set;
+
+ /* REMOVEME: devel stats counters. */
+ atomic_t stats[S_NR_STATS];
+
+ /* Dynamically allocated temporary pointers for xor(). */
+ unsigned long **data;
+
+ /* Dynamically allocated RAID devices. Alignment? */
+ struct raid_dev dev[0];
+};
+
+
+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
+BITOPS(RS, Recover, raid_set, RS_RECOVER)
+BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
+#undef BITOPS
+
+#define PageIO(page) PageChecked(page)
+#define AllowPageIO(page) SetPageChecked(page)
+#define ProhibitPageIO(page) ClearPageChecked(page)
+
+/*-----------------------------------------------------------------
+ * Raid-4/5 set structures.
+ *---------------------------------------------------------------*/
+/* RAID level definitions. */
+enum raid_level {
+ raid4,
+ raid5,
+};
+
+/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
+enum raid_algorithm {
+ none,
+ left_asym,
+ right_asym,
+ left_sym,
+ right_sym,
+};
+
+struct raid_type {
+ const char *name; /* RAID algorithm. */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const enum raid_level level; /* RAID level. */
+ const enum raid_algorithm algorithm; /* RAID algorithm. */
+};
+
+/* Supported raid types and properties. */
+static struct raid_type raid_types[] = {
+ {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
+ {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
+ {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
+ {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
+};
+
+/* Address as calculated by raid_address(). */
+struct address {
+ sector_t key; /* Hash key (start address of stripe). */
+ unsigned di, pi; /* Data and parity disks index. */
+};
+
+/* REMOVEME: reset statistics counters. */
+static void stats_reset(struct raid_set *rs)
+{
+ unsigned s = S_NR_STATS;
+
+ while (s--)
+ atomic_set(rs->stats + s, 0);
+}
+
+/*----------------------------------------------------------------
+ * RAID set management routines.
+ *--------------------------------------------------------------*/
+/*
+ * Begin small helper functions.
+ */
+/* Queue (optionally delayed) io work. */
+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+{
+ struct delayed_work *dws = &rs->io.dws;
+
+ cancel_delayed_work(dws);
+ queue_delayed_work(rs->io.wq, dws, delay);
+}
+
+/* Queue io work immediately (called from region hash too). */
+static INLINE void wake_do_raid(void *context)
+{
+ wake_do_raid_delayed(context, 0);
+}
+
+/* Wait until all io has been processed. */
+static INLINE void wait_ios(struct raid_set *rs)
+{
+ wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
+}
+
+/* Declare io queued to device. */
+static INLINE void io_dev_queued(struct raid_dev *dev)
+{
+ set_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Io on device and reset ? */
+static inline int io_dev_clear(struct raid_dev *dev)
+{
+ return test_and_clear_bit(IO_QUEUED, &dev->flags);
+}
+
+/* Get an io reference. */
+static INLINE void io_get(struct raid_set *rs)
+{
+ int p = atomic_inc_return(&rs->io.in_process);
+
+ if (p > atomic_read(&rs->io.in_process_max))
+ atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
+}
+
+/* Put the io reference and conditionally wake io waiters. */
+static INLINE void io_put(struct raid_set *rs)
+{
+ /* Intel: rebuild data corrupter? */
+ if (!atomic_read(&rs->io.in_process)) {
+ DMERR("%s would go negative!!!", __func__);
+ return;
+ }
+
+ if (atomic_dec_and_test(&rs->io.in_process))
+ wake_up(&rs->io.suspendq);
+}
+
+/* Calculate device sector offset. */
+static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
+{
+ sector_t sector = bio->bi_sector;
+
+ sector_div(sector, rs->set.data_devs);
+ return sector;
+}
+
+/* Test device operational. */
+static INLINE int dev_operational(struct raid_set *rs, unsigned p)
+{
+ return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
+}
+
+/* Return # of active stripes in stripe cache. */
+static INLINE int sc_active(struct stripe_cache *sc)
+{
+ return atomic_read(&sc->active_stripes);
+}
+
+/* Test io pending on stripe. */
+static INLINE int stripe_io(struct stripe *stripe)
+{
+ return atomic_read(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_inc(struct stripe *stripe)
+{
+ atomic_inc(&stripe->io.pending);
+}
+
+static INLINE void stripe_io_dec(struct stripe *stripe)
+{
+ atomic_dec(&stripe->io.pending);
+}
+
+/* Wrapper needed by for_each_io_dev(). */
+static void _stripe_io_inc(struct stripe *stripe, unsigned p)
+{
+ stripe_io_inc(stripe);
+}
+
+/* Error a stripe. */
+static INLINE void stripe_error(struct stripe *stripe, struct page *page)
+{
+ SetStripeError(stripe);
+ SetPageError(page);
+ atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
+}
+
+/* Page IOed ok. */
+enum dirty_type { CLEAN, DIRTY };
+static INLINE void page_set(struct page *page, enum dirty_type type)
+{
+ switch (type) {
+ case DIRTY:
+ SetPageDirty(page);
+ AllowPageIO(page);
+ break;
+
+ case CLEAN:
+ ClearPageDirty(page);
+ break;
+
+ default:
+ BUG();
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+}
+
+/* Return region state for a sector. */
+static INLINE int
+region_state(struct raid_set *rs, sector_t sector, unsigned long state)
+{
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ return RSRecover(rs) ?
+ (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
+ state) : 0;
+}
+
+/* Check maximum devices which may fail in a raid set. */
+static inline int raid_set_degraded(struct raid_set *rs)
+{
+ return RSIoError(rs);
+}
+
+/* Check # of devices which may fail in a raid set. */
+static INLINE int raid_set_operational(struct raid_set *rs)
+{
+ /* Too many failed devices -> BAD. */
+ return atomic_read(&rs->set.failed_devs) <=
+ rs->set.raid_type->parity_devs;
+}
+
+/*
+ * Return true in case a page_list should be read/written
+ *
+ * Conditions to read/write:
+ * o 1st page in list not uptodate
+ * o 1st page in list dirty
+ * o if we optimized io away, we flag it using the pages checked bit.
+ */
+static INLINE unsigned page_io(struct page *page)
+{
+ /* Optimization: page was flagged to need io during first run. */
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ return 1;
+ }
+
+ /* Avoid io if prohibited or a locked page. */
+ if (!PageIO(page) || PageLocked(page))
+ return 0;
+
+ if (!PageUptodate(page) || PageDirty(page)) {
+ /* Flag page needs io for second run optimization. */
+ SetPagePrivate(page);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Call a function on each page list needing io. */
+static INLINE unsigned
+for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
+ void (*f_io)(struct stripe *stripe, unsigned p))
+{
+ unsigned p = rs->set.raid_devs, r = 0;
+
+ while (p--) {
+ if (page_io(PAGE(stripe, p))) {
+ f_io(stripe, p);
+ r++;
+ }
+ }
+
+ return r;
+}
+
+/* Reconstruct a particular device ?. */
+static INLINE int dev_to_init(struct raid_set *rs)
+{
+ return rs->set.dev_to_init > -1;
+}
+
+/*
+ * Index of device to calculate parity on.
+ * Either the parity device index *or* the selected device to init
+ * after a spare replacement.
+ */
+static INLINE unsigned dev_for_parity(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
+}
+
+/* Return the index of the device to be recovered. */
+static int idx_get(struct raid_set *rs)
+{
+ /* Avoid to read in the pages to be reconstructed anyway. */
+ if (dev_to_init(rs))
+ return rs->set.dev_to_init;
+ else if (rs->set.raid_type->level == raid4)
+ return rs->set.pi;
+
+ return -1;
+}
+
+/* RAID set congested function. */
+static int raid_set_congested(void *congested_data, int bdi_bits)
+{
+ struct raid_set *rs = congested_data;
+ int r = 0; /* Assume uncongested. */
+ unsigned p = rs->set.raid_devs;
+
+ /* If any of our component devices are overloaded. */
+ while (p--) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+ }
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+ return r;
+}
+
+/* Display RAID set dead message once. */
+static void raid_set_dead(struct raid_set *rs)
+{
+ if (!TestSetRSDead(rs)) {
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("FATAL: too many devices failed -> RAID set dead");
+
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ if (!dev_operational(rs, p))
+ DMERR("device /dev/%s failed",
+ bdevname(rs->dev[p].dev->bdev, buf));
+ }
+ }
+}
+
+/* RAID set degrade check. */
+static INLINE int
+raid_set_check_and_degrade(struct raid_set *rs,
+ struct stripe *stripe, unsigned p)
+{
+ if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+ /* Through an event in case of member device errors. */
+ dm_table_event(rs->ti->table);
+ atomic_inc(&rs->set.failed_devs);
+
+ /* Only log the first member error. */
+ if (!TestSetRSIoError(rs)) {
+ char buf[BDEVNAME_SIZE];
+
+ /* Store index for recovery. */
+ mb();
+ rs->set.ei = p;
+ mb();
+
+ DMERR("CRITICAL: %sio error on device /dev/%s "
+ "in region=%llu; DEGRADING RAID set",
+ stripe ? "" : "FAKED ",
+ bdevname(rs->dev[p].dev->bdev, buf),
+ (unsigned long long) (stripe ? stripe->key : 0));
+ DMERR("further device error messages suppressed");
+ }
+
+ return 0;
+}
+
+static void
+raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned p = rs->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (PageError(page)) {
+ ClearPageError(page);
+ raid_set_check_and_degrade(rs, stripe, p);
+ }
+ }
+}
+
+/* RAID set upgrade check. */
+static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
+{
+ if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
+ return -EPERM;
+
+ if (atomic_dec_and_test(&rs->set.failed_devs)) {
+ ClearRSIoError(rs);
+ rs->set.ei = -1;
+ }
+
+ return 0;
+}
+
+/* Lookup a RAID device by name or by major:minor number. */
+union dev_lookup {
+ const char *dev_name;
+ struct raid_dev *dev;
+};
+enum lookup_type { byname, bymajmin, bynumber };
+static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
+ union dev_lookup *dl)
+{
+ unsigned p;
+
+ /*
+ * Must be an incremental loop, because the device array
+ * can have empty slots still on calls from raid_ctr()
+ */
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ char buf[BDEVNAME_SIZE];
+ struct raid_dev *dev = rs->dev + p;
+
+ if (!dev->dev)
+ break;
+
+ /* Format dev string appropriately if necessary. */
+ if (by == byname)
+ bdevname(dev->dev->bdev, buf);
+ else if (by == bymajmin)
+ format_dev_t(buf, dev->dev->bdev->bd_dev);
+
+ /* Do the actual check. */
+ if (by == bynumber) {
+ if (dl->dev->dev->bdev->bd_dev ==
+ dev->dev->bdev->bd_dev)
+ return p;
+ } else if (!strcmp(dl->dev_name, buf))
+ return p;
+ }
+
+ return -ENODEV;
+}
+
+/* End io wrapper. */
+static INLINE void
+_bio_endio(struct raid_set *rs, struct bio *bio, int error)
+{
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
+ S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
+ bio_endio(bio, error);
+ io_put(rs); /* Wake any suspend waiters. */
+}
+
+/*
+ * End small helper functions.
+ */
+
+
+/*
+ * Stripe hash functions
+ */
+/* Initialize/destroy stripe hash. */
+static int hash_init(struct stripe_hash *hash, unsigned stripes)
+{
+ unsigned buckets = 2, max_buckets = stripes / 4;
+ unsigned hash_primes[] = {
+ /* Table of primes for hash_fn/table size optimization. */
+ 3, 7, 13, 27, 53, 97, 193, 389, 769,
+ 1543, 3079, 6151, 12289, 24593,
+ };
+
+ /* Calculate number of buckets (2^^n <= stripes / 4). */
+ while (buckets < max_buckets)
+ buckets <<= 1;
+
+ /* Allocate stripe hash. */
+ hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+ if (!hash->hash)
+ return -ENOMEM;
+
+ hash->buckets = buckets;
+ hash->mask = buckets - 1;
+ hash->shift = ffs(buckets);
+ if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
+ hash->shift = ARRAY_SIZE(hash_primes) + 1;
+
+ BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
+ hash->prime = hash_primes[hash->shift - 2];
+
+ /* Initialize buckets. */
+ while (buckets--)
+ INIT_LIST_HEAD(hash->hash + buckets);
+
+ return 0;
+}
+
+static INLINE void hash_exit(struct stripe_hash *hash)
+{
+ if (hash->hash) {
+ vfree(hash->hash);
+ hash->hash = NULL;
+ }
+}
+
+/* List add (head/tail/locked/unlocked) inlines. */
+enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
+#define LIST_DEL(name, list) \
+static void stripe_ ## name ## _del(struct stripe *stripe, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = stripe->sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+\
+ if (!list_empty(lh)) \
+ list_del_init(lh); \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_DEL(hash, LIST_HASH)
+LIST_DEL(lru, LIST_LRU)
+#undef LIST_DEL
+
+enum list_pos_type { POS_HEAD, POS_TAIL };
+#define LIST_ADD(name, list) \
+static void stripe_ ## name ## _add(struct stripe *stripe, \
+ enum list_pos_type pos, \
+ enum list_lock_type lock) { \
+ struct list_head *lh = stripe->lists + (list); \
+ struct stripe_cache *sc = stripe->sc; \
+ spinlock_t *l = NULL; \
+\
+ if (lock == LIST_LOCKED) { \
+ l = sc->locks + LOCK_LRU; \
+ spin_lock_irq(l); \
+ } \
+\
+ if (list_empty(lh)) { \
+ if (pos == POS_HEAD) \
+ list_add(lh, sc->lists + (list)); \
+ else \
+ list_add_tail(lh, sc->lists + (list)); \
+ } \
+\
+ if (lock == LIST_LOCKED) \
+ spin_unlock_irq(l); \
+}
+
+LIST_ADD(endio, LIST_ENDIO)
+LIST_ADD(io, LIST_IO)
+LIST_ADD(lru, LIST_LRU)
+#undef LIST_ADD
+
+#define POP(list) \
+ do { \
+ if (list_empty(sc->lists + list)) \
+ stripe = NULL; \
+ else { \
+ stripe = list_first_entry(&sc->lists[list], \
+ struct stripe, \
+ lists[list]); \
+ list_del_init(&stripe->lists[list]); \
+ } \
+ } while (0);
+
+/* Pop an available stripe off the lru list. */
+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_LRU;
+
+ spin_lock_irq(lock);
+ POP(LIST_LRU);
+ spin_unlock_irq(lock);
+
+ if (stripe)
+ /* Remove from hash before reuse. */
+ stripe_hash_del(stripe, LIST_UNLOCKED);
+
+ return stripe;
+}
+
+static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+{
+ return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
+}
+
+static inline struct list_head *
+hash_bucket(struct stripe_hash *hash, sector_t key)
+{
+ return hash->hash + hash_fn(hash, key);
+}
+
+/* Insert an entry into a hash. */
+static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
+{
+ list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
+}
+
+/* Insert an entry into the stripe hash. */
+static inline void
+sc_insert(struct stripe_cache *sc, struct stripe *stripe)
+{
+ hash_insert(&sc->hash, stripe);
+}
+
+/* Lookup an entry in the stripe hash. */
+static inline struct stripe *
+stripe_lookup(struct stripe_cache *sc, sector_t key)
+{
+ unsigned c = 0;
+ struct stripe *stripe;
+ struct list_head *bucket = hash_bucket(&sc->hash, key);
+
+ list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
+ /* REMOVEME: statisics. */
+ if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+ atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
+
+ if (stripe->key == key)
+ return stripe;
+ }
+
+ return NULL;
+}
+
+/* Resize the stripe cache hash on size changes. */
+static int hash_resize(struct stripe_cache *sc)
+{
+ /* Resize threshold reached? */
+ if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
+ || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
+ int r;
+ struct stripe_hash hash, hash_tmp;
+ spinlock_t *lock;
+
+ r = hash_init(&hash, atomic_read(&sc->stripes));
+ if (r)
+ return r;
+
+ lock = sc->locks + LOCK_LRU;
+ spin_lock_irq(lock);
+ if (sc->hash.hash) {
+ unsigned b = sc->hash.buckets;
+ struct list_head *pos, *tmp;
+
+ /* Walk old buckets and insert into new. */
+ while (b--) {
+ list_for_each_safe(pos, tmp, sc->hash.hash + b)
+ hash_insert(&hash,
+ list_entry(pos, struct stripe,
+ lists[LIST_HASH]));
+ }
+
+ }
+
+ memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
+ memcpy(&sc->hash, &hash, sizeof(sc->hash));
+ atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+ spin_unlock_irq(lock);
+
+ hash_exit(&hash_tmp);
+ }
+
+ return 0;
+}
+
+/*
+ * Stripe cache locking functions
+ */
+/* Dummy lock function for local RAID4+5. */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+ return &no_lock;
+}
+
+/* Dummy unlock function for local RAID4+5. */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/* No locking (for local RAID 4+5). */
+static struct dm_raid45_locking_type locking_none = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Clustered RAID 4+5. */
+/* FIXME: code this. */
+static struct dm_raid45_locking_type locking_cluster = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/* Lock a stripe (for clustering). */
+static int
+stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
+{
+ stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
+ DM_RAID45_EX);
+ return stripe->lock ? 0 : -EPERM;
+}
+
+/* Unlock a stripe (for clustering). */
+static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
+{
+ rs->locking->unlock(stripe->lock);
+ stripe->lock = NULL;
+}
+
+/*
+ * Stripe cache functions.
+ */
+/*
+ * Invalidate all page lists pages of a stripe.
+ *
+ * I only keep state for the whole list in the first page.
+ */
+static INLINE void
+stripe_pages_invalidate(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ ProhibitPageIO(page);
+ ClearPageChecked(page);
+ ClearPageDirty(page);
+ ClearPageError(page);
+ __clear_page_locked(page);
+ ClearPagePrivate(page);
+ ClearPageUptodate(page);
+ }
+}
+
+/* Prepare stripe for (re)use. */
+static INLINE void stripe_invalidate(struct stripe *stripe)
+{
+ stripe->io.flags = 0;
+ stripe_pages_invalidate(stripe);
+}
+
+/* Allow io on all chunks of a stripe. */
+static INLINE void stripe_allow_io(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ AllowPageIO(PAGE(stripe, p));
+}
+
+/* Initialize a stripe. */
+static void
+stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+{
+ unsigned p = RS(sc)->set.raid_devs;
+ unsigned i;
+
+ /* Work all io chunks. */
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+
+ stripe->obj[p].private = ss;
+ ss->stripe = stripe;
+
+ i = ARRAY_SIZE(ss->bl);
+ while (i--)
+ bio_list_init(ss->bl + i);
+ }
+
+ stripe->sc = sc;
+
+ i = ARRAY_SIZE(stripe->lists);
+ while (i--)
+ INIT_LIST_HEAD(stripe->lists + i);
+
+ atomic_set(&stripe->cnt, 0);
+ atomic_set(&stripe->io.pending, 0);
+
+ stripe_invalidate(stripe);
+}
+
+/* Number of pages per chunk. */
+static inline unsigned chunk_pages(unsigned io_size)
+{
+ return dm_div_up(io_size, SECTORS_PER_PAGE);
+}
+
+/* Number of pages per stripe. */
+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+{
+ return chunk_pages(io_size) * rs->set.raid_devs;
+}
+
+/* Initialize part of page_list (recovery). */
+static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
+ unsigned start, unsigned count)
+{
+ unsigned pages = chunk_pages(count);
+ /* Get offset into the page_list. */
+ struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
+
+ BUG_ON(!pl);
+ while (pl && pages--) {
+ BUG_ON(!pl->page);
+ memset(page_address(pl->page), 0, PAGE_SIZE);
+ pl = pl->next;
+ }
+}
+
+/* Initialize parity chunk of stripe. */
+static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
+{
+ stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+}
+
+/* Return dynamic stripe structure size. */
+static INLINE size_t stripe_size(struct raid_set *rs)
+{
+ return sizeof(struct stripe) +
+ rs->set.raid_devs * sizeof(struct stripe_set);
+}
+
+/* Allocate a stripe and its memory object. */
+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+enum grow { SC_GROW, SC_KEEP };
+static struct stripe *stripe_alloc(struct stripe_cache *sc,
+ struct dm_mem_cache_client *mc,
+ enum grow grow)
+{
+ int r;
+ struct stripe *stripe;
+
+ stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+ if (stripe) {
+ /* Grow the dm-mem-cache by one object. */
+ if (grow == SC_GROW) {
+ r = dm_mem_cache_grow(mc, 1);
+ if (r)
+ goto err_free;
+ }
+
+ stripe->obj = dm_mem_cache_alloc(mc);
+ if (!stripe->obj)
+ goto err_shrink;
+
+ stripe_init(sc, stripe);
+ }
+
+ return stripe;
+
+err_shrink:
+ if (grow == SC_GROW)
+ dm_mem_cache_shrink(mc, 1);
+err_free:
+ kmem_cache_free(sc->kc.cache, stripe);
+ return NULL;
+}
+
+/*
+ * Free a stripes memory object, shrink the
+ * memory cache and free the stripe itself
+ */
+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+{
+ dm_mem_cache_free(mc, stripe->obj);
+ dm_mem_cache_shrink(mc, 1);
+ kmem_cache_free(stripe->sc->kc.cache, stripe);
+}
+
+/* Free the recovery stripe. */
+static void stripe_recover_free(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct list_head *stripes = &rec->stripes;
+
+ while (!list_empty(stripes)) {
+ struct stripe *stripe = list_first_entry(stripes, struct stripe,
+ lists[LIST_RECOVER]);
+ list_del(stripe->lists + LIST_RECOVER);
+ stripe_free(stripe, rec->mem_cache_client);
+ }
+}
+
+/* Push a stripe safely onto the endio list to be handled by do_endios(). */
+static INLINE void stripe_endio_push(struct stripe *stripe)
+{
+ int wake;
+ unsigned long flags;
+ struct stripe_cache *sc = stripe->sc;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irqsave(lock, flags);
+ wake = list_empty(sc->lists + LIST_ENDIO);
+ stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
+ spin_unlock_irqrestore(lock, flags);
+
+ if (wake)
+ wake_do_raid(RS(sc));
+}
+
+/* Protected check for stripe cache endio list empty. */
+static INLINE int stripe_endio_empty(struct stripe_cache *sc)
+{
+ int r;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ spin_lock_irq(lock);
+ r = list_empty(sc->lists + LIST_ENDIO);
+ spin_unlock_irq(lock);
+
+ return r;
+}
+
+/* Pop a stripe off safely off the endio list. */
+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+{
+ struct stripe *stripe;
+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+ /* This runs in parallel with endio(). */
+ spin_lock_irq(lock);
+ POP(LIST_ENDIO)
+ spin_unlock_irq(lock);
+ return stripe;
+}
+
+#undef POP
+
+/* Evict stripe from cache. */
+static void stripe_evict(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
+
+ if (list_empty(stripe->lists + LIST_LRU)) {
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
+ }
+}
+
+/* Grow stripe cache. */
+static int
+sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+{
+ int r = 0;
+ struct raid_set *rs = RS(sc);
+
+ /* Try to allocate this many (additional) stripes. */
+ while (stripes--) {
+ struct stripe *stripe =
+ stripe_alloc(sc, sc->mem_cache_client, grow);
+
+ if (likely(stripe)) {
+ stripe->io.size = rs->set.io_size;
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ atomic_inc(&sc->stripes);
+ } else {
+ r = -ENOMEM;
+ break;
+ }
+ }
+
+ ClearRSScBusy(rs);
+ return r ? r : hash_resize(sc);
+}
+
+/* Shrink stripe cache. */
+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+{
+ int r = 0;
+
+ /* Try to get unused stripe from LRU list. */
+ while (stripes--) {
+ struct stripe *stripe;
+
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ /* An lru stripe may never have ios pending! */
+ BUG_ON(stripe_io(stripe));
+ stripe_free(stripe, sc->mem_cache_client);
+ atomic_dec(&sc->stripes);
+ } else {
+ r = -ENOENT;
+ break;
+ }
+ }
+
+ /* Check if stats are still sane. */
+ if (atomic_read(&sc->max_active_stripes) >
+ atomic_read(&sc->stripes))
+ atomic_set(&sc->max_active_stripes, 0);
+
+ if (r)
+ return r;
+
+ ClearRSScBusy(RS(sc));
+ return hash_resize(sc);
+}
+
+/* Create stripe cache. */
+static int sc_init(struct raid_set *rs, unsigned stripes)
+{
+ unsigned i, nr;
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+ struct recover *rec = &rs->recover;
+
+ /* Initialize lists and locks. */
+ i = ARRAY_SIZE(sc->lists);
+ while (i--)
+ INIT_LIST_HEAD(sc->lists + i);
+
+ i = NR_LOCKS;
+ while (i--)
+ spin_lock_init(sc->locks + i);
+
+ /* Initialize atomic variables. */
+ atomic_set(&sc->stripes, 0);
+ atomic_set(&sc->stripes_last, 0);
+ atomic_set(&sc->stripes_to_shrink, 0);
+ atomic_set(&sc->active_stripes, 0);
+ atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
+
+ /*
+ * We need a runtime unique # to suffix the kmem cache name
+ * because we'll have one for each active RAID set.
+ */
+ nr = atomic_inc_return(&_stripe_sc_nr);
+ sprintf(sc->kc.name, "%s_%d", TARGET, nr);
+ sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+ 0, 0, NULL);
+ if (!sc->kc.cache)
+ return -ENOMEM;
+
+ /* Create memory cache client context for RAID stripe cache. */
+ sc->mem_cache_client =
+ dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+ chunk_pages(rs->set.io_size));
+ if (IS_ERR(sc->mem_cache_client))
+ return PTR_ERR(sc->mem_cache_client);
+
+ /* Create memory cache client context for RAID recovery stripe(s). */
+ rec->mem_cache_client =
+ dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
+ chunk_pages(rec->io_size));
+ if (IS_ERR(rec->mem_cache_client))
+ return PTR_ERR(rec->mem_cache_client);
+
+ /* Allocate stripe for set recovery. */
+ /* XXX: cope with MAX_RECOVERY. */
+ INIT_LIST_HEAD(&rec->stripes);
+ for (i = 0; i < MAX_RECOVER; i++) {
+ stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+ if (!stripe)
+ return -ENOMEM;
+
+ SetStripeRecover(stripe);
+ stripe->io.size = rec->io_size;
+ list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
+ }
+
+ /*
+ * Allocate the stripe objetcs from the
+ * cache and add them to the LRU list.
+ */
+ return sc_grow(sc, stripes, SC_KEEP);
+}
+
+/* Destroy the stripe cache. */
+static void sc_exit(struct stripe_cache *sc)
+{
+ if (sc->kc.cache) {
+ BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
+ kmem_cache_destroy(sc->kc.cache);
+ }
+
+ if (sc->mem_cache_client)
+ dm_mem_cache_client_destroy(sc->mem_cache_client);
+
+ ClearRSRecover(RS(sc));
+ stripe_recover_free(RS(sc));
+ if (RS(sc)->recover.mem_cache_client)
+ dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
+
+ hash_exit(&sc->hash);
+}
+
+/*
+ * Calculate RAID address
+ *
+ * Delivers tuple with the index of the data disk holding the chunk
+ * in the set, the parity disks index and the start of the stripe
+ * within the address space of the set (used as the stripe cache hash key).
+ */
+/* thx MD. */
+static struct address *
+raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
+{
+ unsigned data_devs = rs->set.data_devs, di, pi,
+ raid_devs = rs->set.raid_devs;
+ sector_t stripe, tmp;
+
+ /*
+ * chunk_number = sector / chunk_size
+ * stripe = chunk_number / data_devs
+ * di = stripe % data_devs;
+ */
+ stripe = sector >> rs->set.chunk_shift;
+ di = sector_div(stripe, data_devs);
+
+ switch (rs->set.raid_type->level) {
+ case raid5:
+ tmp = stripe;
+ pi = sector_div(tmp, raid_devs);
+
+ switch (rs->set.raid_type->algorithm) {
+ case left_asym: /* Left asymmetric. */
+ pi = data_devs - pi;
+ case right_asym: /* Right asymmetric. */
+ if (di >= pi)
+ di++;
+ break;
+
+ case left_sym: /* Left symmetric. */
+ pi = data_devs - pi;
+ case right_sym: /* Right symmetric. */
+ di = (pi + di + 1) % raid_devs;
+ break;
+
+ default:
+ DMERR("Unknown RAID algorithm %d",
+ rs->set.raid_type->algorithm);
+ goto out;
+ }
+
+ break;
+
+ case raid4:
+ pi = rs->set.pi;
+ if (di >= pi)
+ di++;
+ break;
+
+ default:
+ DMERR("Unknown RAID level %d", rs->set.raid_type->level);
+ goto out;
+ }
+
+ /*
+ * Hash key = start offset on any single device of the RAID set;
+ * adjusted in case io size differs from chunk size.
+ */
+ addr->key = (stripe << rs->set.chunk_shift) +
+ (sector & rs->set.io_shift_mask);
+ addr->di = di;
+ addr->pi = pi;
+
+out:
+ return addr;
+}
+
+/*
+ * Copy data across between stripe pages and bio vectors.
+ *
+ * Pay attention to data alignment in stripe and bio pages.
+ */
+static void
+bio_copy_page_list(int rw, struct stripe *stripe,
+ struct page_list *pl, struct bio *bio)
+{
+ unsigned i, page_offset;
+ void *page_addr;
+ struct raid_set *rs = RS(stripe->sc);
+ struct bio_vec *bv;
+
+ /* Get start page in page list for this sector. */
+ i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+ pl = pl_elem(pl, i);
+
+ page_addr = page_address(pl->page);
+ page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+
+ /* Walk all segments and copy data across between bio_vecs and pages. */
+ bio_for_each_segment(bv, bio, i) {
+ int len = bv->bv_len, size;
+ unsigned bio_offset = 0;
+ void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+redo:
+ size = (page_offset + len > PAGE_SIZE) ?
+ PAGE_SIZE - page_offset : len;
+
+ if (rw == READ)
+ memcpy(bio_addr + bio_offset,
+ page_addr + page_offset, size);
+ else
+ memcpy(page_addr + page_offset,
+ bio_addr + bio_offset, size);
+
+ page_offset += size;
+ if (page_offset == PAGE_SIZE) {
+ /*
+ * We reached the end of the chunk page ->
+ * need refer to the next one to copy more data.
+ */
+ len -= size;
+ if (len) {
+ /* Get next page. */
+ pl = pl->next;
+ BUG_ON(!pl);
+ page_addr = page_address(pl->page);
+ page_offset = 0;
+ bio_offset += size;
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+ goto redo;
+ }
+ }
+
+ __bio_kunmap_atomic(bio_addr, KM_USER0);
+ }
+}
+
+/*
+ * Xor optimization macros.
+ */
+/* Xor data pointer declaration and initialization macros. */
+#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
+#define DECLARE_3 DECLARE_2, *d2 = data[2]
+#define DECLARE_4 DECLARE_3, *d3 = data[3]
+#define DECLARE_5 DECLARE_4, *d4 = data[4]
+#define DECLARE_6 DECLARE_5, *d5 = data[5]
+#define DECLARE_7 DECLARE_6, *d6 = data[6]
+#define DECLARE_8 DECLARE_7, *d7 = data[7]
+
+/* Xor unrole macros. */
+#define D2(n) d0[n] = d0[n] ^ d1[n]
+#define D3(n) D2(n) ^ d2[n]
+#define D4(n) D3(n) ^ d3[n]
+#define D5(n) D4(n) ^ d4[n]
+#define D6(n) D5(n) ^ d5[n]
+#define D7(n) D6(n) ^ d6[n]
+#define D8(n) D7(n) ^ d7[n]
+
+#define X_2(macro, offset) macro(offset); macro(offset + 1);
+#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
+#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
+#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
+#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
+#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
+
+/* Define a _xor_#chunks_#xors_per_run() function. */
+#define _XOR(chunks, xors_per_run) \
+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+{ \
+ unsigned end = XOR_SIZE / sizeof(data[0]), i; \
+ DECLARE_ ## chunks; \
+\
+ for (i = 0; i < end; i += xors_per_run) { \
+ X_ ## xors_per_run(D ## chunks, i); \
+ } \
+}
+
+/* Define xor functions for 2 - 8 chunks. */
+#define MAKE_XOR_PER_RUN(xors_per_run) \
+ _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
+ _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
+ _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
+ _XOR(8, xors_per_run);
+
+MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
+MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
+MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
+MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
+
+#define MAKE_XOR(xors_per_run) \
+struct { \
+ void (*f)(unsigned long **); \
+} static xor_funcs ## xors_per_run[] = { \
+ { NULL }, \
+ { NULL }, \
+ { _xor2_ ## xors_per_run }, \
+ { _xor3_ ## xors_per_run }, \
+ { _xor4_ ## xors_per_run }, \
+ { _xor5_ ## xors_per_run }, \
+ { _xor6_ ## xors_per_run }, \
+ { _xor7_ ## xors_per_run }, \
+ { _xor8_ ## xors_per_run }, \
+}; \
+\
+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+{ \
+ /* Call respective function for amount of chunks. */ \
+ xor_funcs ## xors_per_run[n].f(data); \
+}
+
+/* Define xor_8() - xor_64 functions. */
+MAKE_XOR(8)
+MAKE_XOR(16)
+MAKE_XOR(32)
+MAKE_XOR(64)
+
+/* Maximum number of chunks, which can be xor'ed in one go. */
+#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
+
+struct xor_func {
+ xor_function_t f;
+ const char *name;
+} static xor_funcs[] = {
+ {xor_8, "xor_8"},
+ {xor_16, "xor_16"},
+ {xor_32, "xor_32"},
+ {xor_64, "xor_64"},
+};
+
+/*
+ * Calculate crc.
+ *
+ * This indexes into the page list of the stripe.
+ *
+ * All chunks will be xored into the parity chunk
+ * in maximum groups of xor.chunks.
+ *
+ * FIXME: try mapping the pages on discontiguous memory.
+ */
+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned max_chunks = rs->xor.chunks, n, p;
+ unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
+ unsigned long **d = rs->data;
+ xor_function_t xor_f = rs->xor.f->f;
+
+ /* Address of parity page to xor into. */
+ d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+
+ /* Preset pointers to data pages. */
+ for (n = 1, p = rs->set.raid_devs; p--; ) {
+ if (p != pi && PageIO(PAGE(stripe, p)))
+ d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+
+ /* If max chunks -> xor .*/
+ if (n == max_chunks) {
+ xor_f(n, d);
+ n = 1;
+ }
+ }
+
+ /* If chunks -> xor. */
+ if (n > 1)
+ xor_f(n, d);
+
+ /* Set parity page uptodate and clean. */
+ page_set(PAGE(stripe, pi), CLEAN);
+}
+
+/* Common xor loop through all stripe page lists. */
+static void common_xor(struct stripe *stripe, sector_t count,
+ unsigned off, unsigned p)
+{
+ unsigned sector;
+
+ for (sector = off; sector < count; sector += SECTORS_PER_XOR)
+ xor(stripe, p, sector);
+
+ atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+}
+
+/*
+ * Calculate parity sectors on intact stripes.
+ *
+ * Need to calculate raid address for recover stripe, because its
+ * chunk sizes differs and is typically larger than io chunk size.
+ */
+static void parity_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned chunk_size = rs->set.chunk_size,
+ io_size = stripe->io.size,
+ xor_size = chunk_size > io_size ? io_size : chunk_size;
+ sector_t off;
+
+ /* This can be the recover stripe with a larger io size. */
+ for (off = 0; off < io_size; off += xor_size) {
+ unsigned pi;
+
+ /*
+ * Recover stripe likely is bigger than regular io
+ * ones and has no precalculated parity disk index ->
+ * need to calculate RAID address.
+ */
+ if (unlikely(StripeRecover(stripe))) {
+ struct address addr;
+
+ raid_address(rs,
+ (stripe->key + off) * rs->set.data_devs,
+ &addr);
+ pi = addr.pi;
+ stripe_zero_pl_part(stripe, pi, off,
+ rs->set.chunk_size);
+ } else
+ pi = stripe->idx.parity;
+
+ common_xor(stripe, xor_size, off, pi);
+ page_set(PAGE(stripe, pi), DIRTY);
+ }
+}
+
+/* Reconstruct missing chunk. */
+static void reconstruct_xor(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int p = stripe->idx.recover;
+
+ BUG_ON(p < 0);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + (raid_set_degraded(rs) ?
+ S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
+
+ /* Zero chunk to be reconstructed. */
+ stripe_zero_chunk(stripe, p);
+ common_xor(stripe, stripe->io.size, 0, p);
+}
+
+/*
+ * Try getting a stripe either from the hash or from the lru list
+ */
+static inline void _stripe_get(struct stripe *stripe)
+{
+ atomic_inc(&stripe->cnt);
+}
+
+static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ stripe = stripe_lookup(sc, addr->key);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Remove from the lru list if on. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+ atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+ } else {
+ /* Second try to get an LRU stripe. */
+ stripe = stripe_lru_pop(sc);
+ if (stripe) {
+ _stripe_get(stripe);
+ /* Invalidate before reinserting with changed key. */
+ stripe_invalidate(stripe);
+ stripe->key = addr->key;
+ stripe->region = dm_rh_sector_to_region(rs->recover.rh,
+ addr->key);
+ stripe->idx.parity = addr->pi;
+ sc_insert(sc, stripe);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_INSCACHE);
+ }
+ }
+
+ return stripe;
+}
+
+/*
+ * Decrement reference count on a stripe.
+ *
+ * Move it to list of LRU stripes if zero.
+ */
+static void stripe_put(struct stripe *stripe)
+{
+ if (atomic_dec_and_test(&stripe->cnt)) {
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&stripe->sc->active_stripes);
+
+ /* Put stripe onto the LRU list. */
+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
+ }
+
+ BUG_ON(atomic_read(&stripe->cnt) < 0);
+}
+
+/*
+ * Process end io
+ *
+ * I need to do it here because I can't in interrupt
+ *
+ * Read and write functions are split in order to avoid
+ * conditionals in the main loop for performamce reasons.
+ */
+
+/* Helper read bios on a page list. */
+static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ bio_copy_page_list(READ, stripe, pl, bio);
+}
+
+/* Helper write bios on a page list. */
+static void _rh_dec(struct stripe *stripe, struct page_list *pl,
+ struct bio *bio)
+{
+ dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
+}
+
+/* End io all bios on a page list. */
+static inline int
+page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
+{
+ int r = 0;
+ struct bio_list *bl = BL(stripe, p, rw);
+
+ if (!bio_list_empty(bl)) {
+ struct page_list *pl = PL(stripe, p);
+ struct page *page = pl->page;
+
+ if (PageLocked(page))
+ r = -EBUSY;
+ /*
+ * FIXME: PageUptodate() not cleared
+ * properly for missing chunks ?
+ */
+ else if (PageUptodate(page)) {
+ struct bio *bio;
+ struct raid_set *rs = RS(stripe->sc);
+ void (*h_f)(struct stripe *, struct page_list *,
+ struct bio *) =
+ (rw == READ) ? _bio_copy_page_list : _rh_dec;
+
+ while ((bio = bio_list_pop(bl))) {
+ h_f(stripe, pl, bio);
+ _bio_endio(rs, bio, 0);
+ stripe_put(stripe);
+ if (count)
+ (*count)++;
+ }
+ } else
+ r = -EAGAIN;
+ }
+
+ return r;
+}
+
+/*
+ * End io all reads/writes on a stripe copying
+ * read date accross from stripe to bios.
+ */
+static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--) {
+ int rr = page_list_endio(rw, stripe, p, count);
+
+ if (rr && r != -EIO)
+ r = rr;
+ }
+
+ return r;
+}
+
+/* Fail all ios on a bio list and return # of bios. */
+static unsigned
+bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
+{
+ unsigned r;
+ struct bio *bio;
+
+ raid_set_dead(rs);
+
+ /* Update region counters. */
+ if (stripe) {
+ struct dm_region_hash *rh = rs->recover.rh;
+
+ bio_list_for_each(bio, bl) {
+ if (bio_data_dir(bio) == WRITE)
+ dm_rh_dec(rh, stripe->region);
+ }
+ }
+
+ /* Error end io all bios. */
+ for (r = 0; (bio = bio_list_pop(bl)); r++)
+ _bio_endio(rs, bio, -EIO);
+
+ return r;
+}
+
+/* Fail all ios of a bio list of a stripe and drop io pending count. */
+static void
+stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
+ struct bio_list *bl)
+{
+ unsigned put = bio_list_fail(rs, stripe, bl);
+
+ while (put--)
+ stripe_put(stripe);
+}
+
+/* Fail all ios hanging off all bio lists of a stripe. */
+static void stripe_fail_io(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned p = rs->set.raid_devs;
+
+ stripe_evict(stripe);
+
+ while (p--) {
+ struct stripe_set *ss = stripe->ss + p;
+ int i = ARRAY_SIZE(ss->bl);
+
+ while (i--)
+ stripe_bio_list_fail(rs, stripe, ss->bl + i);
+ }
+}
+
+/*
+ * Handle all stripes by handing them to the daemon, because we can't
+ * map their pages to copy the data in interrupt context.
+ *
+ * We don't want to handle them here either, while interrupts are disabled.
+ */
+
+/* Read/write endio function for dm-io (interrupt context). */
+static void endio(unsigned long error, void *context)
+{
+ struct dm_mem_cache_object *obj = context;
+ struct stripe_set *ss = obj->private;
+ struct stripe *stripe = ss->stripe;
+ struct page *page = obj->pl->page;
+
+ if (unlikely(error))
+ stripe_error(stripe, page);
+ else
+ page_set(page, CLEAN);
+
+ __clear_page_locked(page);
+ stripe_io_dec(stripe);
+
+ /* Add stripe to endio list and wake daemon. */
+ stripe_endio_push(stripe);
+}
+
+/*
+ * Recovery io throttling
+ */
+/* Conditionally reset io counters. */
+enum count_type { IO_WORK = 0, IO_RECOVER };
+static int recover_io_reset(struct raid_set *rs)
+{
+ unsigned long j = jiffies;
+
+ /* Pay attention to jiffies overflows. */
+ if (j > rs->recover.last_jiffies + HZ
+ || j < rs->recover.last_jiffies) {
+ rs->recover.last_jiffies = j;
+ atomic_set(rs->recover.io_count + IO_WORK, 0);
+ atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Count ios. */
+static INLINE void
+recover_io_count(struct raid_set *rs, struct stripe *stripe)
+{
+ if (RSRecover(rs)) {
+ recover_io_reset(rs);
+ atomic_inc(rs->recover.io_count +
+ (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
+ }
+}
+
+/* Read/Write a page_list asynchronously. */
+static void page_list_rw(struct stripe *stripe, unsigned p)
+{
+ struct stripe_cache *sc = stripe->sc;
+ struct raid_set *rs = RS(sc);
+ struct dm_mem_cache_object *obj = stripe->obj + p;
+ struct page_list *pl = obj->pl;
+ struct page *page = pl->page;
+ struct raid_dev *dev = rs->dev + p;
+ struct dm_io_region io = {
+ .bdev = dev->dev->bdev,
+ .sector = stripe->key,
+ .count = stripe->io.size,
+ };
+ struct dm_io_request control = {
+ .bi_rw = PageDirty(page) ? WRITE : READ,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = pl,
+ .mem.offset = 0,
+ .notify.fn = endio,
+ .notify.context = obj,
+ .client = sc->dm_io_client,
+ };
+
+ BUG_ON(PageLocked(page));
+
+ /*
+ * Don't rw past end of device, which can happen, because
+ * typically sectors_per_dev isn't divisable by io_size.
+ */
+ if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
+ io.count = rs->set.sectors_per_dev - io.sector;
+
+ io.sector += dev->start; /* Add <offset>. */
+ recover_io_count(rs, stripe); /* Recovery io accounting. */
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
+
+ ClearPageError(page);
+ __set_page_locked(page);
+ io_dev_queued(dev);
+ BUG_ON(dm_io(&control, 1, &io, NULL));
+}
+
+/*
+ * Write dirty / read not uptodate page lists of a stripe.
+ */
+static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ unsigned r;
+
+ /*
+ * Increment the pending count on the stripe
+ * first, so that we don't race in endio().
+ *
+ * An inc (IO) is needed for any page:
+ *
+ * o not uptodate
+ * o dirtied by writes merged
+ * o dirtied by parity calculations
+ */
+ r = for_each_io_dev(rs, stripe, _stripe_io_inc);
+ if (r) {
+ /* io needed: chunks are not uptodate/dirty. */
+ int max; /* REMOVEME: */
+ struct stripe_cache *sc = &rs->sc;
+
+ if (!TestSetStripeActive(stripe))
+ atomic_inc(&sc->active_stripes);
+
+ /* Take off the lru list in case it got added there. */
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Submit actual io. */
+ for_each_io_dev(rs, stripe, page_list_rw);
+
+ /* REMOVEME: statistics */
+ max = sc_active(sc);
+ if (atomic_read(&sc->max_active_stripes) < max)
+ atomic_set(&sc->max_active_stripes, max);
+
+ atomic_inc(rs->stats + S_FLUSHS);
+ /* END REMOVEME: statistics */
+ }
+
+ return r;
+}
+
+/* Work in all pending writes. */
+static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
+{
+ struct bio_list *write = BL(stripe, p, WRITE);
+
+ if (!bio_list_empty(write)) {
+ struct page_list *pl = stripe->obj[p].pl;
+ struct bio *bio;
+ struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
+
+ /*
+ * We can play with the lists without holding a lock,
+ * because it is just us accessing them anyway.
+ */
+ bio_list_for_each(bio, write)
+ bio_copy_page_list(WRITE, stripe, pl, bio);
+
+ bio_list_merge(write_merged, write);
+ bio_list_init(write);
+ page_set(pl->page, DIRTY);
+ }
+}
+
+/* Merge in all writes hence dirtying respective pages. */
+static INLINE void writes_merge(struct stripe *stripe)
+{
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ while (p--)
+ _writes_merge(stripe, p);
+}
+
+/* Check, if a chunk gets completely overwritten. */
+static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
+{
+ unsigned sectors = 0;
+ struct bio *bio;
+ struct bio_list *bl = BL(stripe, p, WRITE);
+
+ bio_list_for_each(bio, bl)
+ sectors += bio_sectors(bio);
+
+ return sectors == RS(stripe->sc)->set.io_size;
+}
+
+/*
+ * Prepare stripe to avoid io on broken/reconstructed
+ * drive in order to reconstruct date on endio.
+ */
+enum prepare_type { IO_ALLOW, IO_PROHIBIT };
+static void stripe_prepare(struct stripe *stripe, unsigned p,
+ enum prepare_type type)
+{
+ struct page *page = PAGE(stripe, p);
+
+ switch (type) {
+ case IO_PROHIBIT:
+ /*
+ * In case we prohibit, we gotta make sure, that
+ * io on all other chunks than the one which failed
+ * or is being reconstructed is allowed and that it
+ * doesn't have state uptodate.
+ */
+ stripe_allow_io(stripe);
+ ClearPageUptodate(page);
+ ProhibitPageIO(page);
+
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
+ stripe->idx.recover = p;
+ SetStripeReconstruct(stripe);
+ break;
+
+ case IO_ALLOW:
+ AllowPageIO(page);
+ stripe->idx.recover = -1;
+ ClearStripeReconstruct(stripe);
+ break;
+
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Degraded/reconstruction mode.
+ *
+ * Check stripe state to figure which chunks don't need IO.
+ */
+static INLINE void stripe_check_reconstruct(struct stripe *stripe,
+ int prohibited)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ /*
+ * Degraded mode (device(s) failed) ->
+ * avoid io on the failed device.
+ */
+ if (unlikely(raid_set_degraded(rs))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_DEGRADED);
+ stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
+ return;
+ } else {
+ /*
+ * Reconstruction mode (ie. a particular device or
+ * some (rotating) parity chunk is being resynchronized) ->
+ * o make sure all needed pages are read in
+ * o writes are allowed to go through
+ */
+ int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
+
+ if (r) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NOSYNC);
+ stripe_prepare(stripe, dev_for_parity(stripe),
+ IO_PROHIBIT);
+ return;
+ }
+ }
+
+ /*
+ * All disks good. Avoid reading parity chunk and reconstruct it
+ * unless we have prohibited io to chunk(s).
+ */
+ if (!prohibited) {
+ if (StripeMerged(stripe))
+ stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
+ else {
+ stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
+
+ /*
+ * Overrule stripe_prepare to reconstruct the
+ * parity chunk, because it'll be created new anyway.
+ */
+ ClearStripeReconstruct(stripe);
+ }
+ }
+}
+
+/* Check, if stripe is ready to merge writes. */
+static INLINE int stripe_check_merge(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0;
+ unsigned chunks = 0, p = rs->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ /* Can't merge active chunks. */
+ if (PageLocked(page)) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
+ break;
+ }
+
+ /* Can merge uptodate chunks and have to count parity chunk. */
+ if (PageUptodate(page) || p == stripe->idx.parity) {
+ chunks++;
+ continue;
+ }
+
+ /* Read before write ordering. */
+ if (RSCheckOverwrite(rs) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ int r = stripe_check_overwrite(stripe, p);
+
+ if (r) {
+ chunks++;
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats +
+ S_PROHIBITPAGEIO);
+ ProhibitPageIO(page);
+ prohibited = 1;
+ }
+ }
+ }
+
+ if (chunks == rs->set.raid_devs) {
+ /* All pages are uptodate or get written over or mixture. */
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_CAN_MERGE);
+ return 0;
+ } else
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_CANT_MERGE);
+
+ return prohibited ? 1 : -EPERM;
+}
+
+/* Check, if stripe is ready to merge writes. */
+static INLINE int stripe_check_read(struct stripe *stripe)
+{
+ int r = 0;
+ unsigned p = RS(stripe->sc)->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct page *page = PAGE(stripe, p);
+
+ if (!PageLocked(page) &&
+ bio_list_empty(BL(stripe, p, READ))) {
+ ProhibitPageIO(page);
+ r = 1;
+ }
+ }
+
+ return r;
+}
+
+/*
+ * Read/write a stripe.
+ *
+ * All stripe read/write activity goes through this function.
+ *
+ * States to cover:
+ * o stripe to read and/or write
+ * o stripe with error to reconstruct
+ */
+static int stripe_rw(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ int prohibited = 0, r;
+
+ /*
+ * Check the state of the RAID set and if degraded (or
+ * resynchronizing for reads), read in all other chunks but
+ * the one on the dead/resynchronizing device in order to be
+ * able to reconstruct the missing one.
+ *
+ * Merge all writes hanging off uptodate pages of the stripe.
+ */
+
+ /* Initially allow io on all chunks and prohibit below, if necessary. */
+ stripe_allow_io(stripe);
+
+ if (StripeRBW(stripe)) {
+ r = stripe_check_merge(stripe);
+ if (!r) {
+ /*
+ * If I could rely on valid parity (which would only
+ * be sure in case of a full synchronization),
+ * I could xor a fraction of chunks out of
+ * parity and back in.
+ *
+ * For the time being, I got to redo parity...
+ */
+ /* parity_xor(stripe); */ /* Xor chunks out. */
+ stripe_zero_chunk(stripe, stripe->idx.parity);
+ writes_merge(stripe); /* Merge writes in. */
+ parity_xor(stripe); /* Update parity. */
+ ClearStripeRBW(stripe); /* Disable RBW. */
+ SetStripeMerged(stripe); /* Writes merged. */
+ }
+
+ if (r > 0)
+ prohibited = 1;
+ } else if (!raid_set_degraded(rs))
+ /* Only allow for read avoidance if not degraded. */
+ prohibited = stripe_check_read(stripe);
+
+ /*
+ * Check, if io needs to be allowed/prohibeted on certain chunks
+ * because of a degraded set or reconstruction on a region.
+ */
+ stripe_check_reconstruct(stripe, prohibited);
+
+ /* Now submit any reads/writes. */
+ r = stripe_page_lists_rw(rs, stripe);
+ if (!r) {
+ /*
+ * No io submitted because of chunk io prohibited or
+ * locked pages -> push to end io list for processing.
+ */
+ atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+ stripe_endio_push(stripe);
+ wake_do_raid(rs); /* Wake myself. */
+ }
+
+ return 0;
+}
+
+/* Flush stripe either via flush list or imeediately. */
+enum flush_type { FLUSH_DELAY, FLUSH_NOW };
+static int stripe_flush(struct stripe *stripe, enum flush_type type)
+{
+ int r = 0;
+
+ stripe_lru_del(stripe, LIST_LOCKED);
+
+ /* Immediately flush. */
+ if (type == FLUSH_NOW) {
+ if (likely(raid_set_operational(RS(stripe->sc))))
+ r = stripe_rw(stripe); /* Read/write stripe. */
+ else
+ /* Optimization: Fail early on failed sets. */
+ stripe_fail_io(stripe);
+ /* Delay flush by putting it on io list for later processing. */
+ } else if (type == FLUSH_DELAY)
+ stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
+ else
+ BUG();
+
+ return r;
+}
+
+/*
+ * Queue reads and writes to a stripe by hanging
+ * their bios off the stripsets read/write lists.
+ *
+ * Endio reads on uptodate chunks.
+ */
+static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+ struct bio_list *reject)
+{
+ int r = 0;
+ struct address addr;
+ struct stripe *stripe =
+ stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
+
+ if (stripe) {
+ int rr, rw = bio_data_dir(bio);
+
+ rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
+ if (rr) {
+ stripe_put(stripe);
+ goto out;
+ }
+
+ /* Distinguish read and write cases. */
+ bio_list_add(BL(stripe, addr.di, rw), bio);
+
+ /* REMOVEME: statistics */
+ atomic_inc(rs->stats + (rw == WRITE ?
+ S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
+
+ if (rw == READ)
+ SetStripeRead(stripe);
+ else {
+ SetStripeRBW(stripe);
+
+ /* Inrement pending write count on region. */
+ dm_rh_inc(rs->recover.rh, stripe->region);
+ r = 1; /* Region hash needs a flush. */
+ }
+
+ /*
+ * Optimize stripe flushing:
+ *
+ * o directly start io for read stripes.
+ *
+ * o put stripe onto stripe caches io_list for RBW,
+ * so that do_flush() can belabour it after we put
+ * more bios to the stripe for overwrite optimization.
+ */
+ stripe_flush(stripe,
+ StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
+
+ /* Got no stripe from cache -> reject bio. */
+ } else {
+out:
+ bio_list_add(reject, bio);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_IOS_POST);
+ }
+
+ return r;
+}
+
+/*
+ * Recovery functions
+ */
+/* Read a stripe off a raid set for recovery. */
+static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /* Invalidate all pages so that they get read in. */
+ stripe_pages_invalidate(stripe);
+
+ /* Allow io on all recovery chunks. */
+ stripe_allow_io(stripe);
+
+ if (idx > -1)
+ ProhibitPageIO(PAGE(stripe, idx));
+
+ stripe->key = rs->recover.pos;
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Write a stripe to a raid set for recovery. */
+static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
+{
+ /*
+ * If this is a reconstruct of a particular device, then
+ * reconstruct the respective page(s), else create parity page(s).
+ */
+ if (idx > -1) {
+ struct page *page = PAGE(stripe, idx);
+
+ AllowPageIO(page);
+ stripe_zero_chunk(stripe, idx);
+ common_xor(stripe, stripe->io.size, 0, idx);
+ page_set(page, DIRTY);
+ } else
+ parity_xor(stripe);
+
+ return stripe_page_lists_rw(rs, stripe);
+}
+
+/* Recover bandwidth available ?. */
+static int recover_bandwidth(struct raid_set *rs)
+{
+ int r, work;
+
+ /* On reset -> allow recovery. */
+ r = recover_io_reset(rs);
+ if (r || RSBandwidth(rs))
+ goto out;
+
+ work = atomic_read(rs->recover.io_count + IO_WORK);
+ if (work) {
+ /* Pay attention to larger recover stripe size. */
+ int recover =
+ atomic_read(rs->recover.io_count + IO_RECOVER) *
+ rs->recover.io_size /
+ rs->set.io_size;
+
+ /*
+ * Don't use more than given bandwidth of
+ * the work io for recovery.
+ */
+ if (recover > work / rs->recover.bandwidth_work) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NO_BANDWIDTH);
+ return 0;
+ }
+ }
+
+out:
+ atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
+ return 1;
+}
+
+/* Try to get a region to recover. */
+static int recover_get_region(struct raid_set *rs)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region_hash *rh = rec->rh;
+
+ /* Start quiescing some regions. */
+ if (!RSRegionGet(rs)) {
+ int r = recover_bandwidth(rs); /* Enough bandwidth ?. */
+
+ if (r) {
+ r = dm_rh_recovery_prepare(rh);
+ if (r < 0) {
+ DMINFO("No %sregions to recover",
+ rec->nr_regions_to_recover ?
+ "more " : "");
+ return -ENOENT;
+ }
+ } else
+ return -EAGAIN;
+
+ SetRSRegionGet(rs);
+ }
+
+ if (!rec->reg) {
+ rec->reg = dm_rh_recovery_start(rh);
+ if (rec->reg) {
+ /*
+ * A reference for the the region I'll
+ * keep till I've completely synced it.
+ */
+ io_get(rs);
+ rec->pos = dm_rh_region_to_sector(rh,
+ dm_rh_get_region_key(rec->reg));
+ rec->end = rec->pos + dm_rh_get_region_size(rh);
+ return 1;
+ } else
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* Read/write a recovery stripe. */
+static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
+{
+ /* Read/write flip-flop. */
+ if (TestClearStripeRBW(stripe)) {
+ SetStripeRead(stripe);
+ return recover_read(rs, stripe, idx_get(rs));
+ } else if (TestClearStripeRead(stripe))
+ return recover_write(rs, stripe, idx_get(rs));
+
+ return 0;
+}
+
+/* Reset recovery variables. */
+static void recovery_region_reset(struct raid_set *rs)
+{
+ rs->recover.reg = NULL;
+ ClearRSRegionGet(rs);
+}
+
+/* Update region hash state. */
+static void recover_rh_update(struct raid_set *rs, int error)
+{
+ struct recover *rec = &rs->recover;
+ struct dm_region *reg = rec->reg;
+
+ if (reg) {
+ dm_rh_recovery_end(reg, error);
+ if (!error)
+ rec->nr_regions_recovered++;
+
+ recovery_region_reset(rs);
+ }
+
+ dm_rh_update_states(reg->rh, 1);
+ dm_rh_flush(reg->rh);
+ io_put(rs); /* Release the io reference for the region. */
+}
+
+/* Called by main io daemon to recover regions. */
+/* FIXME: cope with MAX_RECOVER > 1. */
+static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
+{
+ int r;
+ struct recover *rec = &rs->recover;
+
+ /* If recovery is active -> return. */
+ if (StripeActive(stripe))
+ return;
+
+ /* io error is fatal for recovery -> stop it. */
+ if (unlikely(StripeError(stripe)))
+ goto err;
+
+ /* Get a region to recover. */
+ r = recover_get_region(rs);
+ switch (r) {
+ case 1: /* Got a new region. */
+ /* Flag read before write. */
+ ClearStripeRead(stripe);
+ SetStripeRBW(stripe);
+ break;
+
+ case 0:
+ /* Got a region in the works. */
+ r = recover_bandwidth(rs);
+ if (r) /* Got enough bandwidth. */
+ break;
+
+ case -EAGAIN:
+ /* No bandwidth/quiesced region yet, try later. */
+ wake_do_raid_delayed(rs, HZ / 10);
+ return;
+
+ case -ENOENT: /* No more regions. */
+ dm_table_event(rs->ti->table);
+ goto free;
+ }
+
+ /* Read/write a recover stripe. */
+ r = recover_stripe_rw(rs, stripe);
+ if (r) {
+ /* IO initiated, get another reference for the IO. */
+ io_get(rs);
+ return;
+ }
+
+ /* Update recovery position within region. */
+ rec->pos += stripe->io.size;
+
+ /* If we're at end of region, update region hash. */
+ if (rec->pos >= rec->end ||
+ rec->pos >= rs->set.sectors_per_dev)
+ recover_rh_update(rs, 0);
+ else
+ SetStripeRBW(stripe);
+
+ /* Schedule myself for another round... */
+ wake_do_raid(rs);
+ return;
+
+err:
+ raid_set_check_degrade(rs, stripe);
+
+ {
+ char buf[BDEVNAME_SIZE];
+
+ DMERR("stopping recovery due to "
+ "ERROR on /dev/%s, stripe at offset %llu",
+ bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+ (unsigned long long) stripe->key);
+
+ }
+
+ /* Make sure, that all quiesced regions get released. */
+ do {
+ if (rec->reg)
+ dm_rh_recovery_end(rec->reg, -EIO);
+
+ rec->reg = dm_rh_recovery_start(rec->rh);
+ } while (rec->reg);
+
+ recover_rh_update(rs, -EIO);
+free:
+ rs->set.dev_to_init = -1;
+
+ /* Check for jiffies overrun. */
+ rs->recover.end_jiffies = jiffies;
+ if (rs->recover.end_jiffies < rs->recover.start_jiffies)
+ rs->recover.end_jiffies = ~0;
+
+ ClearRSRecover(rs);
+}
+
+static INLINE void do_recovery(struct raid_set *rs)
+{
+ struct stripe *stripe;
+
+ list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
+ _do_recovery(rs, stripe);
+
+ if (!RSRecover(rs))
+ stripe_recover_free(rs);
+}
+
+/*
+ * END recovery functions
+ */
+
+/* End io process all stripes handed in by endio() callback. */
+static void do_endios(struct raid_set *rs)
+{
+ struct stripe_cache *sc = &rs->sc;
+ struct stripe *stripe;
+
+ while ((stripe = stripe_endio_pop(sc))) {
+ unsigned count;
+
+ /* Recovery stripe special case. */
+ if (unlikely(StripeRecover(stripe))) {
+ if (stripe_io(stripe))
+ continue;
+
+ io_put(rs); /* Release region io reference. */
+ ClearStripeActive(stripe);
+
+ /* REMOVEME: statistics*/
+ atomic_dec(&sc->active_stripes);
+ continue;
+ }
+
+ /* Early end io all reads on any uptodate chunks. */
+ stripe_endio(READ, stripe, (count = 0, &count));
+ if (stripe_io(stripe)) {
+ if (count) /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_ACTIVE_READS);
+
+ continue;
+ }
+
+ /* Set stripe inactive after all io got processed. */
+ if (TestClearStripeActive(stripe))
+ atomic_dec(&sc->active_stripes);
+
+ /* Unlock stripe (for clustering). */
+ stripe_unlock(rs, stripe);
+
+ /*
+ * If an io error on a stripe occured and the RAID set
+ * is still operational, requeue the stripe for io.
+ */
+ if (TestClearStripeError(stripe)) {
+ raid_set_check_degrade(rs, stripe);
+ ClearStripeReconstruct(stripe);
+
+ if (!StripeMerged(stripe) &&
+ raid_set_operational(rs)) {
+ stripe_pages_invalidate(stripe);
+ stripe_flush(stripe, FLUSH_DELAY);
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_REQUEUE);
+ continue;
+ }
+ }
+
+ /* Check if the RAID set is inoperational to error ios. */
+ if (!raid_set_operational(rs)) {
+ ClearStripeReconstruct(stripe);
+ stripe_fail_io(stripe);
+ BUG_ON(atomic_read(&stripe->cnt));
+ continue;
+ }
+
+ /* Got to reconstruct a missing chunk. */
+ if (TestClearStripeReconstruct(stripe))
+ reconstruct_xor(stripe);
+
+ /*
+ * Now that we've got a complete stripe, we can
+ * process the rest of the end ios on reads.
+ */
+ BUG_ON(stripe_endio(READ, stripe, NULL));
+ ClearStripeRead(stripe);
+
+ /*
+ * Read-before-write stripes need to be flushed again in
+ * order to work the write data into the pages *after*
+ * they were read in.
+ */
+ if (TestClearStripeMerged(stripe))
+ /* End io all bios which got merged already. */
+ BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
+
+ /* Got to put on flush list because of new writes. */
+ if (StripeRBW(stripe))
+ stripe_flush(stripe, FLUSH_DELAY);
+ }
+}
+
+/*
+ * Stripe cache shrinking.
+ */
+static INLINE void do_sc_shrink(struct raid_set *rs)
+{
+ unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
+
+ if (shrink) {
+ unsigned cur = atomic_read(&rs->sc.stripes);
+
+ sc_shrink(&rs->sc, shrink);
+ shrink -= cur - atomic_read(&rs->sc.stripes);
+ atomic_set(&rs->sc.stripes_to_shrink, shrink);
+
+ /*
+ * Wake myself up in case we failed to shrink the
+ * requested amount in order to try again later.
+ */
+ if (shrink)
+ wake_do_raid(rs);
+ }
+}
+
+
+/*
+ * Process all ios
+ *
+ * We do different things with the io depending on the
+ * state of the region that it's in:
+ *
+ * o reads: hang off stripe cache or postpone if full
+ *
+ * o writes:
+ *
+ * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
+ * In case stripe cache is full or busy, postpone the io.
+ *
+ * RECOVERING: delay the io until recovery of the region completes.
+ *
+ */
+static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
+{
+ int r;
+ unsigned flush = 0;
+ struct dm_region_hash *rh = rs->recover.rh;
+ struct bio *bio;
+ struct bio_list delay, reject;
+
+ bio_list_init(&delay);
+ bio_list_init(&reject);
+
+ /*
+ * Classify each io:
+ * o delay to recovering regions
+ * o queue to all other regions
+ */
+ while ((bio = bio_list_pop(ios))) {
+ /*
+ * In case we get a barrier bio, push it back onto
+ * the input queue unless all work queues are empty
+ * and the stripe cache is inactive.
+ */
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BARRIER);
+ if (!list_empty(rs->sc.lists + LIST_IO) ||
+ !bio_list_empty(&delay) ||
+ !bio_list_empty(&reject) ||
+ sc_active(&rs->sc)) {
+ bio_list_push(ios, bio);
+ break;
+ }
+ }
+
+ r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
+ if (unlikely(r)) {
+ /* Got to wait for recovering regions. */
+ bio_list_add(&delay, bio);
+ SetRSBandwidth(rs);
+ } else {
+ /*
+ * Process ios to non-recovering regions by queueing
+ * them to stripes (does rh_inc()) for writes).
+ */
+ flush += stripe_queue_bio(rs, bio, &reject);
+ }
+ }
+
+ if (flush) {
+ r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
+ if (r)
+ DMERR("dirty log flush");
+ }
+
+ /* Delay ios to regions which are recovering. */
+ while ((bio = bio_list_pop(&delay))) {
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_DELAYED_BIOS);
+ atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+ dm_rh_delay(rh, bio);
+
+ }
+
+ /* Merge any rejected bios back to the head of the input list. */
+ bio_list_merge_head(ios, &reject);
+}
+
+/* Flush any stripes on the io list. */
+static INLINE void do_flush(struct raid_set *rs)
+{
+ struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
+
+ list_for_each_safe(pos, tmp, list) {
+ int r = stripe_flush(list_entry(pos, struct stripe,
+ lists[LIST_IO]), FLUSH_NOW);
+
+ /* Remove from the list only if the stripe got processed. */
+ if (!r)
+ list_del_init(pos);
+ }
+}
+
+/* Send an event in case we're getting too busy. */
+static INLINE void do_busy_event(struct raid_set *rs)
+{
+ if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
+ if (!TestSetRSScBusy(rs))
+ dm_table_event(rs->ti->table);
+ } else
+ ClearRSScBusy(rs);
+}
+
+/* Unplug: let the io role on the sets devices. */
+static INLINE void do_unplug(struct raid_set *rs)
+{
+ struct raid_dev *dev = rs->dev + rs->set.raid_devs;
+
+ while (dev-- > rs->dev) {
+ /* Only call any device unplug function, if io got queued. */
+ if (io_dev_clear(dev))
+ blk_unplug(bdev_get_queue(dev->dev->bdev));
+ }
+}
+
+/*-----------------------------------------------------------------
+ * RAID daemon
+ *---------------------------------------------------------------*/
+/*
+ * o belabour all end ios
+ * o optionally shrink the stripe cache
+ * o update the region hash states
+ * o optionally do recovery
+ * o grab the input queue
+ * o work an all requeued or new ios and perform stripe cache flushs
+ * unless the RAID set is inoperational (when we error ios)
+ * o check, if the stripe cache gets too busy and throw an event if so
+ * o unplug any component raid devices with queued bios
+ */
+static void do_raid(struct work_struct *ws)
+{
+ struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
+ struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+ spinlock_t *lock = &rs->io.in_lock;
+
+ /*
+ * We always need to end io, so that ios
+ * can get errored in case the set failed
+ * and the region counters get decremented
+ * before we update the region hash states.
+ */
+redo:
+ do_endios(rs);
+
+ /*
+ * Now that we've end io'd, which may have put stripes on
+ * the LRU list, we shrink the stripe cache if requested.
+ */
+ do_sc_shrink(rs);
+
+ /* Update region hash states before we go any further. */
+ dm_rh_update_states(rs->recover.rh, 1);
+
+ /* Try to recover regions. */
+ if (RSRecover(rs))
+ do_recovery(rs);
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+
+ /* Quickly grab all new ios queued and add them to the work list. */
+ spin_lock_irq(lock);
+ bio_list_merge(ios, ios_in);
+ bio_list_init(ios_in);
+ spin_unlock_irq(lock);
+
+ /* Let's assume we're operational most of the time ;-). */
+ if (likely(raid_set_operational(rs))) {
+ /* If we got ios, work them into the cache. */
+ if (!bio_list_empty(ios)) {
+ do_ios(rs, ios);
+ do_unplug(rs); /* Unplug the sets device queues. */
+ }
+
+ do_flush(rs); /* Flush any stripes on io list. */
+ do_unplug(rs); /* Unplug the sets device queues. */
+ do_busy_event(rs); /* Check if we got too busy. */
+
+ /* More endios -> process. */
+ if (!stripe_endio_empty(&rs->sc)) {
+ atomic_inc(rs->stats + S_REDO);
+ goto redo;
+ }
+ } else
+ /* No way to reconstruct data with too many devices failed. */
+ bio_list_fail(rs, NULL, ios);
+}
+
+/*
+ * Callback for region hash to dispatch
+ * delayed bios queued to recovered regions
+ * (Gets called via rh_update_states()).
+ */
+static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+{
+ struct raid_set *rs = context;
+ struct bio *bio;
+
+ /* REMOVEME: decrement pending delayed bios counter. */
+ bio_list_for_each(bio, bl)
+ atomic_dec(rs->stats + S_DELAYED_BIOS);
+
+ /* Merge region hash private list to work list. */
+ bio_list_merge_head(&rs->io.work, bl);
+ bio_list_init(bl);
+ ClearRSBandwidth(rs);
+}
+
+/*************************************************************
+ * Constructor helpers
+ *************************************************************/
+/* Calculate MB/sec. */
+static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
+{
+ return to_bytes(speed * rs->set.data_devs *
+ rs->recover.io_size * HZ >> 10) >> 10;
+}
+
+/*
+ * Discover fastest xor algorithm and # of chunks combination.
+ */
+/* Calculate speed for algorithm and # of chunks. */
+static INLINE unsigned xor_speed(struct stripe *stripe)
+{
+ unsigned r = 0;
+ unsigned long j;
+
+ /* Wait for next tick. */
+ for (j = jiffies; j == jiffies;)
+ ;
+
+ /* Do xors for a full tick. */
+ for (j = jiffies; j == jiffies;) {
+ mb();
+ common_xor(stripe, stripe->io.size, 0, 0);
+ mb();
+ r++;
+ mb();
+ }
+
+ return r;
+}
+
+/* Optimize xor algorithm for this RAID set. */
+static unsigned xor_optimize(struct raid_set *rs)
+{
+ unsigned chunks_max = 2, speed_max = 0;
+ struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+ struct stripe *stripe;
+
+ BUG_ON(list_empty(&rs->recover.stripes));
+ stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+ lists[LIST_RECOVER]);
+
+ /*
+ * Got to allow io on all chunks, so that
+ * xor() will actually work on them.
+ */
+ stripe_allow_io(stripe);
+
+ /* Try all xor functions. */
+ while (f-- > xor_funcs) {
+ unsigned speed;
+
+ /* Set actual xor function for common_xor(). */
+ rs->xor.f = f;
+ rs->xor.chunks = XOR_CHUNKS_MAX + 1;
+
+ while (rs->xor.chunks-- > 2) {
+ speed = xor_speed(stripe);
+ if (speed > speed_max) {
+ speed_max = speed;
+ chunks_max = rs->xor.chunks;
+ f_max = f;
+ }
+ }
+ }
+
+ /* Memorize optimum parameters. */
+ rs->xor.f = f_max;
+ rs->xor.chunks = chunks_max;
+ return speed_max;
+}
+
+static inline int array_too_big(unsigned long fixed, unsigned long obj,
+ unsigned long num)
+{
+ return (num > (ULONG_MAX - fixed) / obj);
+}
+
+static void wakeup_all_recovery_waiters(void *context)
+{
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+static int
+context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
+ unsigned stripes, unsigned chunk_size, unsigned io_size,
+ unsigned recover_io_size, unsigned raid_devs,
+ sector_t sectors_per_dev,
+ struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+ int r;
+ unsigned p;
+ size_t len;
+ sector_t region_size, ti_len;
+ struct raid_set *rs = NULL;
+ struct dm_dirty_log *dl;
+ struct recover *rec;
+
+ /*
+ * Create the dirty log
+ *
+ * We need to change length for the dirty log constructor,
+ * because we want an amount of regions for all stripes derived
+ * from the single device size, so that we can keep region
+ * size = 2^^n independant of the number of devices
+ */
+ ti_len = ti->len;
+ ti->len = sectors_per_dev;
+ dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
+ ti->len = ti_len;
+ if (!dl)
+ goto bad_dirty_log;
+
+ /* Chunk size *must* be smaller than region size. */
+ region_size = dl->type->get_region_size(dl);
+ if (chunk_size > region_size)
+ goto bad_chunk_size;
+
+ /* Recover io size *must* be smaller than region size as well. */
+ if (recover_io_size > region_size)
+ goto bad_recover_io_size;
+
+ /* Size and allocate the RAID set structure. */
+ len = sizeof(*rs->data) + sizeof(*rs->dev);
+ if (array_too_big(sizeof(*rs), len, raid_devs))
+ goto bad_array;
+
+ len = sizeof(*rs) + raid_devs * len;
+ rs = kzalloc(len, GFP_KERNEL);
+ if (!rs)
+ goto bad_alloc;
+
+ rec = &rs->recover;
+ atomic_set(&rs->io.in_process, 0);
+ atomic_set(&rs->io.in_process_max, 0);
+ rec->io_size = recover_io_size;
+
+ /* Pointer to data array. */
+ rs->data = (unsigned long **)
+ ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+ rec->dl = dl;
+ rs->set.raid_devs = p = raid_devs;
+ rs->set.data_devs = raid_devs - raid_type->parity_devs;
+ rs->set.raid_type = raid_type;
+
+ /*
+ * Set chunk and io size and respective shifts
+ * (used to avoid divisions)
+ */
+ rs->set.chunk_size = chunk_size;
+ rs->set.chunk_mask = chunk_size - 1;
+ rs->set.chunk_shift = ffs(chunk_size) - 1;
+
+ rs->set.io_size = io_size;
+ rs->set.io_mask = io_size - 1;
+ rs->set.io_shift = ffs(io_size) - 1;
+ rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
+
+ rs->set.pages_per_io = chunk_pages(io_size);
+ rs->set.sectors_per_dev = sectors_per_dev;
+
+ rs->set.ei = -1; /* Indicate no failed device. */
+ atomic_set(&rs->set.failed_devs, 0);
+
+ rs->ti = ti;
+
+ atomic_set(rec->io_count + IO_WORK, 0);
+ atomic_set(rec->io_count + IO_RECOVER, 0);
+
+ /* Initialize io lock and queues. */
+ spin_lock_init(&rs->io.in_lock);
+ bio_list_init(&rs->io.in);
+ bio_list_init(&rs->io.work);
+
+ init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
+
+ rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+
+ rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, wake_do_raid,
+ wakeup_all_recovery_waiters,
+ rs->ti->begin, MAX_RECOVER, dl,
+ region_size, rs->recover.nr_regions);
+ if (IS_ERR(rec->rh))
+ goto bad_rh;
+
+ /* Initialize stripe cache. */
+ r = sc_init(rs, stripes);
+ if (r)
+ goto bad_sc;
+
+ /* Create dm-io client context. */
+ rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
+ rs->set.pages_per_io);
+ if (IS_ERR(rs->sc.dm_io_client))
+ goto bad_dm_io_client;
+
+ /* REMOVEME: statistics. */
+ stats_reset(rs);
+ ClearRSDevelStats(rs); /* Disnable development status. */
+
+ *raid_set = rs;
+ return 0;
+
+bad_dirty_log:
+ TI_ERR_RET("Error creating dirty log", -ENOMEM);
+
+
+bad_chunk_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Chunk size larger than region size");
+
+bad_recover_io_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Recover stripe io size larger than region size");
+
+bad_array:
+ dm_dirty_log_destroy(dl);
+ TI_ERR("Arry too big");
+
+bad_alloc:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
+
+bad_rh:
+ dm_dirty_log_destroy(dl);
+ ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+ goto free_rs;
+
+bad_sc:
+ ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+ goto free;
+
+bad_dm_io_client:
+ ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
+free:
+ dm_region_hash_destroy(rec->rh);
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rec->rh); /* Destroys dirty log as well. */
+free_rs:
+ kfree(rs);
+ return -ENOMEM;
+}
+
+/* Free a RAID context (a RAID set). */
+static void
+context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
+{
+ while (r--)
+ dm_put_device(ti, rs->dev[r].dev);
+
+ dm_io_client_destroy(rs->sc.dm_io_client);
+ sc_exit(&rs->sc);
+ dm_region_hash_destroy(rs->recover.rh);
+ dm_dirty_log_destroy(rs->recover.dl);
+ kfree(rs);
+}
+
+/* Create work queue and initialize work. */
+static int rs_workqueue_init(struct raid_set *rs)
+{
+ struct dm_target *ti = rs->ti;
+
+ rs->io.wq = create_singlethread_workqueue(DAEMON);
+ if (!rs->io.wq)
+ TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+
+ INIT_DELAYED_WORK(&rs->io.dws, do_raid);
+ return 0;
+}
+
+/* Return pointer to raid_type structure for raid name. */
+static struct raid_type *get_raid_type(char *name)
+{
+ struct raid_type *r = ARRAY_END(raid_types);
+
+ while (r-- > raid_types) {
+ if (!strnicmp(STR_LEN(r->name, name)))
+ return r;
+ }
+
+ return NULL;
+}
+
+/* FIXME: factor out to dm core. */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+ sector_t r = a;
+
+ sector_div(r, b);
+ *n = r;
+ return a == r * b;
+}
+
+/* Log RAID set information to kernel log. */
+static void raid_set_log(struct raid_set *rs, unsigned speed)
+{
+ unsigned p;
+ char buf[BDEVNAME_SIZE];
+
+ for (p = 0; p < rs->set.raid_devs; p++)
+ DMINFO("/dev/%s is raid disk %u",
+ bdevname(rs->dev[p].dev->bdev, buf), p);
+
+ DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
+ rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+ atomic_read(&rs->sc.stripes));
+ DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
+ rs->xor.chunks, mbpers(rs, speed));
+ DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
+ rs->set.data_devs, rs->set.raid_devs);
+}
+
+/* Get all devices and offsets. */
+static int
+dev_parms(struct dm_target *ti, struct raid_set *rs,
+ char **argv, int *p)
+{
+ for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+ int r;
+ unsigned long long tmp;
+ struct raid_dev *dev = rs->dev + *p;
+ union dev_lookup dl = {.dev = dev };
+
+ /* Get offset and device. */
+ r = sscanf(argv[1], "%llu", &tmp);
+ if (r != 1)
+ TI_ERR("Invalid RAID device offset parameter");
+
+ dev->start = tmp;
+ r = dm_get_device(ti, argv[0], dev->start,
+ rs->set.sectors_per_dev,
+ dm_table_get_mode(ti->table), &dev->dev);
+ if (r)
+ TI_ERR_RET("RAID device lookup failure", r);
+
+ r = raid_dev_lookup(rs, bynumber, &dl);
+ if (r != -ENODEV && r < *p) {
+ (*p)++; /* Ensure dm_put_device() on actual device. */
+ TI_ERR_RET("Duplicate RAID device", -ENXIO);
+ }
+ }
+
+ return 0;
+}
+
+/* Set recovery bandwidth. */
+static INLINE void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+ rs->recover.bandwidth = bandwidth;
+ rs->recover.bandwidth_work = 100 / bandwidth;
+}
+
+/* Handle variable number of RAID parameters. */
+static int
+raid_variable_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *raid_parms,
+ int *chunk_size, int *chunk_size_parm,
+ int *stripes, int *stripes_parm,
+ int *io_size, int *io_size_parm,
+ int *recover_io_size, int *recover_io_size_parm,
+ int *bandwidth, int *bandwidth_parm)
+{
+ /* Fetch # of variable raid parameters. */
+ if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
+ !range_ok(*raid_parms, 0, 5))
+ TI_ERR("Bad variable raid parameters number");
+
+ if (*raid_parms) {
+ /*
+ * If we've got variable RAID parameters,
+ * chunk size is the first one
+ */
+ if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
+ (*chunk_size != -1 &&
+ (!POWER_OF_2(*chunk_size) ||
+ !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
+ TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
+
+ *chunk_size_parm = *chunk_size;
+ if (*chunk_size == -1)
+ *chunk_size = CHUNK_SIZE;
+
+ /*
+ * In case we've got 2 or more variable raid
+ * parameters, the number of stripes is the second one
+ */
+ if (*raid_parms > 1) {
+ if (sscanf(argv[i++], "%d", stripes) != 1 ||
+ (*stripes != -1 &&
+ !range_ok(*stripes, STRIPES_MIN,
+ STRIPES_MAX)))
+ TI_ERR("Invalid number of stripes: must "
+ "be >= 8 and <= 8192");
+ }
+
+ *stripes_parm = *stripes;
+ if (*stripes == -1)
+ *stripes = STRIPES;
+
+ /*
+ * In case we've got 3 or more variable raid
+ * parameters, the io size is the third one.
+ */
+ if (*raid_parms > 2) {
+ if (sscanf(argv[i++], "%d", io_size) != 1 ||
+ (*io_size != -1 &&
+ (!POWER_OF_2(*io_size) ||
+ !range_ok(*io_size, IO_SIZE_MIN,
+ min(BIO_MAX_SECTORS / 2,
+ *chunk_size)))))
+ TI_ERR("Invalid io size; must "
+ "be 2^^n and less equal "
+ "min(BIO_MAX_SECTORS/2, chunk size)");
+ } else
+ *io_size = *chunk_size;
+
+ *io_size_parm = *io_size;
+ if (*io_size == -1)
+ *io_size = *chunk_size;
+
+ /*
+ * In case we've got 4 variable raid parameters,
+ * the recovery stripe io_size is the fourth one
+ */
+ if (*raid_parms > 3) {
+ if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
+ (*recover_io_size != -1 &&
+ (!POWER_OF_2(*recover_io_size) ||
+ !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
+ BIO_MAX_SECTORS / 2))))
+ TI_ERR("Invalid recovery io size; must be "
+ "2^^n and less equal BIO_MAX_SECTORS/2");
+ }
+
+ *recover_io_size_parm = *recover_io_size;
+ if (*recover_io_size == -1)
+ *recover_io_size = RECOVER_IO_SIZE;
+
+ /*
+ * In case we've got 5 variable raid parameters,
+ * the recovery io bandwidth is the fifth one
+ */
+ if (*raid_parms > 4) {
+ if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
+ (*bandwidth != -1 &&
+ !range_ok(*bandwidth, BANDWIDTH_MIN,
+ BANDWIDTH_MAX)))
+ TI_ERR("Invalid recovery bandwidth "
+ "percentage; must be > 0 and <= 100");
+ }
+
+ *bandwidth_parm = *bandwidth;
+ if (*bandwidth == -1)
+ *bandwidth = BANDWIDTH;
+ }
+
+ return 0;
+}
+
+/* Parse optional locking parameters. */
+static int
+raid_locking_parms(struct dm_target *ti, char **argv,
+ unsigned i, int *locking_parms,
+ struct dm_raid45_locking_type **locking_type)
+{
+ *locking_parms = 0;
+ *locking_type = &locking_none;
+
+ if (!strnicmp(argv[i], "none", strlen(argv[i])))
+ *locking_parms = 1;
+ else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
+ *locking_type = &locking_none;
+ *locking_parms = 2;
+ } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
+ *locking_type = &locking_cluster;
+ /* FIXME: namespace. */
+ *locking_parms = 3;
+ }
+
+ return *locking_parms == 1 ? -EINVAL : 0;
+}
+
+/* Set backing device information properties of RAID set. */
+static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
+{
+ unsigned p, ra_pages;
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+ struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+ /* Set read-ahead for the RAID set and the component devices. */
+ bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
+ ra_pages = chunks * chunk_pages(rs->set.io_size);
+ for (p = rs->set.raid_devs; p--; ) {
+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
+
+ q->backing_dev_info.ra_pages = ra_pages;
+ }
+
+ /* Set congested function and data. */
+ bdi->congested_fn = raid_set_congested;
+ bdi->congested_data = rs;
+
+ dm_put(md);
+}
+
+/* Get backing device information properties of RAID set. */
+static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
+{
+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
+
+ *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
+ / stripe_pages(rs, rs->set.io_size);
+ *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
+ / chunk_pages(rs->set.io_size);
+
+ dm_put(md);
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N >= 0 and < #raid_devs: parity device index
+ *
+ * #raid_variable_params = 0-5; raid_params (-1 = default):
+ * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
+ * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ * and <= CHUNK_SIZE_MAX)
+ * o #stripes is number of stripes allocated to stripe cache
+ * (must be > 1 and < STRIPES_MAX)
+ * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ * o recover_io_size (io unit size per device for recovery in sectors;
+ must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ * o %recovery_bandwith is the maximum amount spend for recovery during
+ * application io (1-100%)
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 3)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ * of a failed devices content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset> = begin at offset on <dev_path>
+ *
+ */
+#define MIN_PARMS 13
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int bandwidth = BANDWIDTH, bandwidth_parm = -1,
+ chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
+ dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
+ i, io_size = IO_SIZE, io_size_parm = -1,
+ r, raid_devs, raid_parms,
+ recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
+ stripes = STRIPES, stripes_parm = -1;
+ unsigned speed;
+ sector_t tmp, sectors_per_dev;
+ struct dm_raid45_locking_type *locking;
+ struct raid_set *rs;
+ struct raid_type *raid_type;
+
+ /* Ensure minimum number of parameters. */
+ if (argc < MIN_PARMS)
+ TI_ERR("Not enough parameters");
+
+ /* Fetch # of dirty log parameters. */
+ if (sscanf(argv[1], "%d", &dl_parms) != 1
+ || !range_ok(dl_parms, 1, 4711))
+ TI_ERR("Bad dirty log parameters number");
+
+ /* Check raid_type. */
+ raid_type = get_raid_type(argv[dl_parms + 2]);
+ if (!raid_type)
+ TI_ERR("Bad raid type");
+
+ /* In case of RAID4, parity drive is selectable. */
+ parity_parm = !!(raid_type->level == raid4);
+
+ /* Handle variable number of RAID parameters. */
+ r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
+ &raid_parms,
+ &chunk_size, &chunk_size_parm,
+ &stripes, &stripes_parm,
+ &io_size, &io_size_parm,
+ &recover_io_size, &recover_io_size_parm,
+ &bandwidth, &bandwidth_parm);
+ if (r)
+ return r;
+
+ r = raid_locking_parms(ti, argv,
+ dl_parms + parity_parm + raid_parms + 4,
+ &locking_parms, &locking);
+ if (r)
+ return r;
+
+ /* # of raid devices. */
+ i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
+ if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+ raid_devs < raid_type->minimal_devs)
+ TI_ERR("Invalid number of raid devices");
+
+ /* In case of RAID4, check parity drive index is in limits. */
+ if (raid_type->level == raid4) {
+ /* Fetch index of parity device. */
+ if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+ !range_ok(pi, 0, raid_devs - 1))
+ TI_ERR("Invalid RAID4 parity device index");
+ }
+
+ /*
+ * Index of device to initialize starts at 0
+ *
+ * o -1 -> don't initialize a particular device,
+ * o 0..raid_devs-1 -> initialize respective device
+ * (used for reconstruction of a replaced device)
+ */
+ if (sscanf
+ (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
+ "%d", &dev_to_init) != 1
+ || !range_ok(dev_to_init, -1, raid_devs - 1))
+ TI_ERR("Invalid number for raid device to initialize");
+
+ /* Check # of raid device arguments. */
+ if (argc - dl_parms - parity_parm - raid_parms - 6 !=
+ 2 * raid_devs)
+ TI_ERR("Wrong number of raid device/offset arguments");
+
+ /*
+ * Check that the table length is devisable
+ * w/o rest by (raid_devs - parity_devs)
+ */
+ if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+ &sectors_per_dev))
+ TI_ERR
+ ("Target length not divisable by number of data devices");
+
+ /*
+ * Check that the device size is
+ * devisable w/o rest by chunk size
+ */
+ if (!multiple(sectors_per_dev, chunk_size, &tmp))
+ TI_ERR("Device length not divisable by chunk_size");
+
+ /****************************************************************
+ * Now that we checked the constructor arguments ->
+ * let's allocate the RAID set
+ ****************************************************************/
+ r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
+ recover_io_size, raid_devs, sectors_per_dev,
+ ti, dl_parms, argv);
+ if (r)
+ return r;
+
+ /*
+ * Set these here in order to avoid passing
+ * too many arguments to context_alloc()
+ */
+ rs->set.dev_to_init_parm = dev_to_init;
+ rs->set.dev_to_init = dev_to_init;
+ rs->set.pi_parm = pi;
+ rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+ rs->set.raid_parms = raid_parms;
+ rs->set.chunk_size_parm = chunk_size_parm;
+ rs->set.io_size_parm = io_size_parm;
+ rs->sc.stripes_parm = stripes_parm;
+ rs->recover.io_size_parm = recover_io_size_parm;
+ rs->recover.bandwidth_parm = bandwidth_parm;
+ recover_set_bandwidth(rs, bandwidth);
+
+ /* Use locking type to lock stripe access. */
+ rs->locking = locking;
+
+ /* Get the device/offset tupels. */
+ argv += dl_parms + 6 + parity_parm + raid_parms;
+ r = dev_parms(ti, rs, argv, &i);
+ if (r)
+ goto err;
+
+ /* Initialize recovery. */
+ rs->recover.start_jiffies = jiffies;
+ rs->recover.end_jiffies = 0;
+ recovery_region_reset(rs);
+
+ /* Allow for recovery of any nosync regions. */
+ SetRSRecover(rs);
+
+ /* Set backing device information (eg. read ahead). */
+ rs_set_bdi(rs, chunk_size * 2, io_size * 4);
+ SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+
+ speed = xor_optimize(rs); /* Select best xor algorithm. */
+
+ /* Initialize work queue to handle this RAID set's io. */
+ r = rs_workqueue_init(rs);
+ if (r)
+ goto err;
+
+ raid_set_log(rs, speed); /* Log information about RAID set. */
+
+ /*
+ * Make sure that dm core only hands maximum io size
+ * length down and pays attention to io boundaries.
+ */
+ ti->split_io = rs->set.io_size;
+ ti->private = rs;
+ return 0;
+
+err:
+ context_free(rs, ti, i);
+ return r;
+}
+
+/*
+ * Destruct a raid mapping
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ /* Indicate recovery end so that ios in flight drain. */
+ ClearRSRecover(rs);
+
+ wake_do_raid(rs); /* Wake daemon. */
+ wait_ios(rs); /* Wait for any io still being processed. */
+ destroy_workqueue(rs->io.wq);
+ context_free(rs, ti, rs->set.raid_devs);
+}
+
+/* Queues ios to RAID sets. */
+static inline void queue_bio(struct raid_set *rs, struct bio *bio)
+{
+ int wake;
+ struct bio_list *in = &rs->io.in;
+ spinlock_t *in_lock = &rs->io.in_lock;
+
+ spin_lock_irq(in_lock);
+ wake = bio_list_empty(in);
+ bio_list_add(in, bio);
+ spin_unlock_irq(in_lock);
+
+ /* Wake daemon if input list was empty. */
+ if (wake)
+ wake_do_raid(rs);
+}
+
+/* Raid mapping function. */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ /* I don't want to waste stripe cache capacity. */
+ if (bio_rw(bio) == READA)
+ return -EIO;
+ else {
+ struct raid_set *rs = ti->private;
+
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats +
+ (bio_data_dir(bio) == WRITE ?
+ S_BIOS_WRITE : S_BIOS_READ));
+
+ /*
+ * Get io reference to be waiting for to drop
+ * to zero on device suspension/destruction.
+ */
+ io_get(rs);
+ bio->bi_sector -= ti->begin; /* Remap sector. */
+ queue_bio(rs, bio); /* Queue to the daemon. */
+ return DM_MAPIO_SUBMITTED; /* Handle later. */
+ }
+}
+
+/* Device suspend. */
+static void raid_postsuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct dm_dirty_log *dl = rs->recover.dl;
+
+ SetRSSuspended(rs);
+
+ if (RSRecover(rs))
+ dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
+ else
+ wake_do_raid(rs);
+
+ wait_ios(rs); /* Wait for completion of all ios being processed. */
+ if (dl->type->postsuspend && dl->type->postsuspend(dl))
+ /* Suspend dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log suspend failed");
+}
+
+/* Device resume. */
+static void raid_resume(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct dm_dirty_log *dl = rec->dl;
+
+ if (dl->type->resume && dl->type->resume(dl))
+ /* Resume dirty log. */
+ /* FIXME: need better error handling. */
+ DMWARN("log resume failed");
+
+ rec->nr_regions_to_recover =
+ rec->nr_regions - dl->type->get_sync_count(dl);
+
+ ClearRSSuspended(rs);
+
+ /* Reset any unfinished recovery. */
+ if (RSRecover(rs)) {
+ recovery_region_reset(rs);
+ dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
+ } else
+ wake_do_raid(rs);
+}
+
+static INLINE unsigned sc_size(struct raid_set *rs)
+{
+ return to_sector(atomic_read(&rs->sc.stripes) *
+ (sizeof(struct stripe) +
+ (sizeof(struct stripe_set) +
+ (sizeof(struct page_list) +
+ to_bytes(rs->set.io_size) *
+ rs->set.raid_devs)) +
+ (rs->recover.
+ end_jiffies ? 0 : to_bytes(rs->set.raid_devs *
+ rs->recover.
+ io_size))));
+}
+
+/* REMOVEME: status output for development. */
+static void
+raid_devel_stats(struct dm_target *ti, char *result,
+ unsigned *size, unsigned maxlen)
+{
+ unsigned chunks, stripes, sz = *size;
+ unsigned long j;
+ char buf[BDEVNAME_SIZE], *p;
+ struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
+ struct raid_set *rs = ti->private;
+ struct recover *rec = &rs->recover;
+ struct timespec ts;
+
+ DMEMIT("%s ", version);
+ DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
+ DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
+
+ for (sm = stats_map; sm < sm_end; sm++)
+ DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+
+ DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
+ DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
+ atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
+ sc_size(rs));
+
+ j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+ rec->start_jiffies;
+ jiffies_to_timespec(j, &ts);
+ sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
+ p = strchr(buf, '.');
+ p[3] = 0;
+
+ DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
+ (unsigned long long) rec->nr_regions_recovered,
+ RSRegionGet(rs) ? "+" : "",
+ (unsigned long long) rec->nr_regions_to_recover,
+ (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+
+ rs_get_ra(rs, &stripes, &chunks);
+ DMEMIT("ra=%u/%u ", stripes, chunks);
+
+ *size = sz;
+}
+
+static int
+raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ unsigned i, sz = 0;
+ char buf[BDEVNAME_SIZE];
+ struct raid_set *rs = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* REMOVEME: statistics. */
+ if (RSDevelStats(rs))
+ raid_devel_stats(ti, result, &sz, maxlen);
+
+ DMEMIT("%u ", rs->set.raid_devs);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s ",
+ format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
+
+ DMEMIT("1 ");
+ for (i = 0; i < rs->set.raid_devs; i++) {
+ DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
+
+ if (rs->set.raid_type->level == raid4 &&
+ i == rs->set.pi)
+ DMEMIT("p");
+
+ if (rs->set.dev_to_init == i)
+ DMEMIT("i");
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ sz = rs->recover.dl->type->status(rs->recover.dl, type,
+ result, maxlen);
+ DMEMIT("%s %u ", rs->set.raid_type->name,
+ rs->set.raid_parms);
+
+ if (rs->set.raid_type->level == raid4)
+ DMEMIT("%d ", rs->set.pi_parm);
+
+ if (rs->set.raid_parms)
+ DMEMIT("%d ", rs->set.chunk_size_parm);
+
+ if (rs->set.raid_parms > 1)
+ DMEMIT("%d ", rs->sc.stripes_parm);
+
+ if (rs->set.raid_parms > 2)
+ DMEMIT("%d ", rs->set.io_size_parm);
+
+ if (rs->set.raid_parms > 3)
+ DMEMIT("%d ", rs->recover.io_size_parm);
+
+ if (rs->set.raid_parms > 4)
+ DMEMIT("%d ", rs->recover.bandwidth_parm);
+
+ DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+
+ for (i = 0; i < rs->set.raid_devs; i++)
+ DMEMIT("%s %llu ",
+ format_dev_t(buf,
+ rs->dev[i].dev->bdev->bd_dev),
+ (unsigned long long) rs->dev[i].start);
+ }
+
+ return 0;
+}
+
+/*
+ * Message interface
+ */
+enum raid_msg_actions {
+ act_bw, /* Recovery bandwidth switch. */
+ act_dev, /* Device failure switch. */
+ act_overwrite, /* Stripe overwrite check. */
+ act_read_ahead, /* Set read ahead. */
+ act_stats, /* Development statistics switch. */
+ act_sc, /* Stripe cache switch. */
+
+ act_on, /* Set entity on. */
+ act_off, /* Set entity off. */
+ act_reset, /* Reset entity. */
+
+ act_set = act_on, /* Set # absolute. */
+ act_grow = act_off, /* Grow # by an amount. */
+ act_shrink = act_reset, /* Shrink # by an amount. */
+};
+
+/* Turn a delta to absolute. */
+static int _absolute(unsigned long action, int act, int r)
+{
+ /* Make delta absolute. */
+ if (test_bit(act_set, &action))
+ ;
+ else if (test_bit(act_grow, &action))
+ r += act;
+ else if (test_bit(act_shrink, &action))
+ r = act - r;
+ else
+ r = -EINVAL;
+
+ return r;
+}
+
+ /* Change recovery io bandwidth. */
+static int bandwidth_change(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+ int act = rs->recover.bandwidth;
+ int bandwidth = DM_MSG_INT_ARG(msg);
+
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ /* Make delta bandwidth absolute. */
+ bandwidth = _absolute(msg->action, act, bandwidth);
+
+ /* Check range. */
+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
+ recover_set_bandwidth(rs, bandwidth);
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Change state of a device (running/offline). */
+/* FIXME: this only works while recovering!. */
+static int device_state(struct dm_msg *msg, void *context)
+{
+ int r;
+ const char *str = "is already ";
+ union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
+ struct raid_set *rs = context;
+
+ r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
+ bymajmin : byname, &dl);
+ if (r == -ENODEV) {
+ DMERR("device %s is no member of this set", dl.dev_name);
+ return r;
+ }
+
+ if (test_bit(act_off, &msg->action)) {
+ if (dev_operational(rs, r))
+ str = "";
+ } else if (!dev_operational(rs, r))
+ str = "";
+
+ DMINFO("/dev/%s %s%s", dl.dev_name, str,
+ test_bit(act_off, &msg->action) ? "offline" : "running");
+
+ return test_bit(act_off, &msg->action) ?
+ raid_set_check_and_degrade(rs, NULL, r) :
+ raid_set_check_and_upgrade(rs, r);
+}
+
+/* Set/reset development feature flags. */
+static int devel_flags(struct dm_msg *msg, void *context)
+{
+ struct raid_set *rs = context;
+
+ if (test_bit(act_on, &msg->action))
+ return test_and_set_bit(msg->spec->parm,
+ &rs->io.flags) ? -EPERM : 0;
+ else if (test_bit(act_off, &msg->action))
+ return test_and_clear_bit(msg->spec->parm,
+ &rs->io.flags) ? 0 : -EPERM;
+ else if (test_bit(act_reset, &msg->action)) {
+ if (test_bit(act_stats, &msg->action)) {
+ stats_reset(rs);
+ goto on;
+ } else if (test_bit(act_overwrite, &msg->action)) {
+on:
+ set_bit(msg->spec->parm, &rs->io.flags);
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+ /* Set stripe and chunk read ahead pages. */
+static int read_ahead_set(struct dm_msg *msg, void *context)
+{
+ int stripes = DM_MSG_INT_ARGS(msg, 0);
+ int chunks = DM_MSG_INT_ARGS(msg, 1);
+
+ if (range_ok(stripes, 1, 512) &&
+ range_ok(chunks, 1, 512)) {
+ rs_set_bdi(context, stripes, chunks);
+ return 0;
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Resize the stripe cache. */
+static int stripecache_resize(struct dm_msg *msg, void *context)
+{
+ int act, stripes;
+ struct raid_set *rs = context;
+
+ /* Deny permission in case the daemon is still shrinking!. */
+ if (atomic_read(&rs->sc.stripes_to_shrink))
+ return -EPERM;
+
+ stripes = DM_MSG_INT_ARG(msg);
+ if (stripes > 0) {
+ act = atomic_read(&rs->sc.stripes);
+
+ /* Make delta stripes absolute. */
+ stripes = _absolute(msg->action, act, stripes);
+
+ /*
+ * Check range and that the # of stripes changes.
+ * We can grow from gere but need to leave any
+ * shrinking to the worker for synchronization.
+ */
+ if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
+ if (stripes > act)
+ return sc_grow(&rs->sc, stripes - act, SC_GROW);
+ else if (stripes < act) {
+ atomic_set(&rs->sc.stripes_to_shrink,
+ act - stripes);
+ wake_do_raid(rs);
+ }
+
+ return 0;
+ }
+ }
+
+ set_bit(dm_msg_ret_arg, &msg->ret);
+ set_bit(dm_msg_ret_inval, &msg->ret);
+ return -EINVAL;
+}
+
+/* Parse the RAID message action. */
+/*
+ * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g 'ba se 50'
+ * 'de{vice] o[ffline]/r[unning] DevName/maj:min' # e.g 'device o /dev/sda'
+ * "o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
+ * "r[ead_ahead] set #stripes #chunks # e.g. 'r se 3 2'
+ * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
+ * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
+ *
+ */
+static int
+raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ /* Variables to store the parsed parameters im. */
+ static int i[2];
+ static unsigned long *i_arg[] = {
+ (unsigned long *) i + 0,
+ (unsigned long *) i + 1,
+ };
+ static char *p;
+ static unsigned long *p_arg[] = { (unsigned long *) &p };
+
+ /* Declare all message option strings. */
+ static char *str_sgs[] = { "set", "grow", "shrink" };
+ static char *str_dev[] = { "running", "offline" };
+ static char *str_oor[] = { "on", "off", "reset" };
+
+ /* Declare all actions. */
+ static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
+ static unsigned long act_oor[] = { act_on, act_off, act_reset };
+
+ /* Bandwidth option. */
+ static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
+ static struct dm_message_argument bw_args = {
+ 1, i_arg, { dm_msg_int_t }
+ };
+
+ /* Device option. */
+ static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
+ static struct dm_message_argument dev_args = {
+ 1, p_arg, { dm_msg_base_t }
+ };
+
+ /* Read ahead option. */
+ static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
+ static struct dm_message_argument ra_args = {
+ 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
+ };
+
+ static struct dm_message_argument null_args = {
+ 0, NULL, { dm_msg_int_t }
+ };
+
+ /* Overwrite and statistics option. */
+ static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
+
+ /* Sripecache option. */
+ static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
+
+ /* Declare messages. */
+ static struct dm_msg_spec specs[] = {
+ { "bandwidth", act_bw, &bw_opt, &bw_args,
+ 0, bandwidth_change },
+ { "device", act_dev, &dev_opt, &dev_args,
+ 0, device_state },
+ { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
+ RS_CHECK_OVERWRITE, devel_flags },
+ { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
+ 0, read_ahead_set },
+ { "statistics", act_stats, &ovr_stats_opt, &null_args,
+ RS_DEVEL_STATS, devel_flags },
+ { "stripecache", act_sc, &stripe_opt, &bw_args,
+ 0, stripecache_resize },
+ };
+
+ /* The message for the parser. */
+ struct dm_msg msg = {
+ .num_specs = ARRAY_SIZE(specs),
+ .specs = specs,
+ };
+
+ return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
+}
+/*
+ * END message interface
+ */
+
+static struct target_type raid_target = {
+ .name = "raid45",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+ .status = raid_status,
+ .message = raid_message,
+};
+
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+ if (r)
+ DMERR("Failed to %sregister target [%d]", bad_msg, r);
+ else
+ DMINFO("%s %s", good_msg, version);
+}
+
+static int __init dm_raid_init(void)
+{
+ int r;
+
+ r = dm_register_target(&raid_target);
+ init_exit("", "initialized", r);
+ return r;
+}
+
+static void __exit dm_raid_exit(void)
+{
+ dm_unregister_target(&raid_target);
+ init_exit("un", "exit", 0);
+}
+
+/* Module hooks. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
+MODULE_LICENSE("GPL");
--- /dev/null
+++ b/drivers/md/dm-raid45.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
+ *
+ * Locking definitions for the device-mapper RAID45 target.
+ *
+ * This file is released under the GPL.
+ *
+ */
+
+#ifndef _DM_RAID45_H
+#define _DM_RAID45_H
+
+/* Factor out to dm.h! */
+#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr))
+
+enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
+
+struct dm_raid45_locking_type {
+ /* Request a lock on a stripe. */
+ void* (*lock)(sector_t key, enum dm_lock_type type);
+
+ /* Release a lock on a stripe. */
+ void (*unlock)(void *lock_handle);
+};
+
+#endif
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -53,100 +53,6 @@
* 'delayed_bios' fields of the regions. This is used from irq
* context, so all other uses will have to suspend local irqs.
*---------------------------------------------------------------*/
-struct dm_region_hash {
- uint32_t region_size;
- unsigned region_shift;
-
- /* holds persistent region state */
- struct dm_dirty_log *log;
-
- /* hash table */
- rwlock_t hash_lock;
- mempool_t *region_pool;
- unsigned mask;
- unsigned nr_buckets;
- unsigned prime;
- unsigned shift;
- struct list_head *buckets;
-
- unsigned max_recovery; /* Max # of regions to recover in parallel */
-
- spinlock_t region_lock;
- atomic_t recovery_in_flight;
- struct semaphore recovery_count;
- struct list_head clean_regions;
- struct list_head quiesced_regions;
- struct list_head recovered_regions;
- struct list_head failed_recovered_regions;
-
- /*
- * If there was a barrier failure no regions can be marked clean.
- */
- int barrier_failure;
-
- void *context;
- sector_t target_begin;
-
- /* Callback function to schedule bios writes */
- void (*dispatch_bios)(void *context, struct bio_list *bios);
-
- /* Callback function to wakeup callers worker thread. */
- void (*wakeup_workers)(void *context);
-
- /* Callback function to wakeup callers recovery waiters. */
- void (*wakeup_all_recovery_waiters)(void *context);
-};
-
-struct dm_region {
- struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
- region_t key;
- int state;
-
- struct list_head hash_list;
- struct list_head list;
-
- atomic_t pending;
- struct bio_list delayed_bios;
-};
-
-/*
- * Conversion fns
- */
-static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
-{
- return sector >> rh->region_shift;
-}
-
-sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
-{
- return region << rh->region_shift;
-}
-EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
-
-region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
-{
- return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
-}
-EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
-
-void *dm_rh_region_context(struct dm_region *reg)
-{
- return reg->rh->context;
-}
-EXPORT_SYMBOL_GPL(dm_rh_region_context);
-
-region_t dm_rh_get_region_key(struct dm_region *reg)
-{
- return reg->key;
-}
-EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
-
-sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
-{
- return rh->region_size;
-}
-EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
-
/*
* FIXME: shall we pass in a structure instead of all these args to
* dm_region_hash_create()????
@@ -495,7 +401,7 @@ void dm_rh_update_states(struct dm_regio
}
EXPORT_SYMBOL_GPL(dm_rh_update_states);
-static void rh_inc(struct dm_region_hash *rh, region_t region)
+void dm_rh_inc(struct dm_region_hash *rh, region_t region)
{
struct dm_region *reg;
@@ -517,6 +423,7 @@ static void rh_inc(struct dm_region_hash
read_unlock(&rh->hash_lock);
}
+EXPORT_SYMBOL_GPL(dm_rh_inc);
void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
@@ -525,7 +432,7 @@ void dm_rh_inc_pending(struct dm_region_
for (bio = bios->head; bio; bio = bio->bi_next) {
if (bio_empty_barrier(bio))
continue;
- rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+ dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
@@ -614,8 +521,9 @@ static int __rh_recovery_prepare(struct
return 1;
}
-void dm_rh_recovery_prepare(struct dm_region_hash *rh)
+int dm_rh_recovery_prepare(struct dm_region_hash *rh)
{
+ int r = 0;
/* Extra reference to avoid race with dm_rh_stop_recovery */
atomic_inc(&rh->recovery_in_flight);
@@ -624,13 +532,17 @@ void dm_rh_recovery_prepare(struct dm_re
if (__rh_recovery_prepare(rh) <= 0) {
atomic_dec(&rh->recovery_in_flight);
up(&rh->recovery_count);
+ r = -ENOENT;
break;
}
}
/* Drop the extra reference */
- if (atomic_dec_and_test(&rh->recovery_in_flight))
+ if (atomic_dec_and_test(&rh->recovery_in_flight)) {
rh->wakeup_all_recovery_waiters(rh->context);
+ r = -ESRCH;
+ }
+ return r;
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2673,6 +2673,7 @@ struct gendisk *dm_disk(struct mapped_de
{
return md->disk;
}
+EXPORT_SYMBOL_GPL(dm_disk);
struct kobject *dm_kobject(struct mapped_device *md)
{
--- a/include/linux/dm-region-hash.h
+++ b/include/linux/dm-region-hash.h
@@ -15,8 +15,62 @@
/*-----------------------------------------------------------------
* Region hash
*----------------------------------------------------------------*/
-struct dm_region_hash;
-struct dm_region;
+struct dm_region_hash {
+ uint32_t region_size;
+ unsigned region_shift;
+
+ /* holds persistent region state */
+ struct dm_dirty_log *log;
+
+ /* hash table */
+ rwlock_t hash_lock;
+ mempool_t *region_pool;
+ unsigned mask;
+ unsigned nr_buckets;
+ unsigned prime;
+ unsigned shift;
+ struct list_head *buckets;
+
+ unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+ spinlock_t region_lock;
+ atomic_t recovery_in_flight;
+ struct semaphore recovery_count;
+ struct list_head clean_regions;
+ struct list_head quiesced_regions;
+ struct list_head recovered_regions;
+ struct list_head failed_recovered_regions;
+
+ /*
+ * If there was a barrier failure no regions can be marked clean.
+ */
+ int barrier_failure;
+
+ void *context;
+ sector_t target_begin;
+
+ /* Callback function to schedule bios writes */
+ void (*dispatch_bios)(void *context, struct bio_list *bios);
+
+ /* Callback function to wakeup callers worker thread. */
+ void (*wakeup_workers)(void *context);
+
+ /* Callback function to wakeup callers recovery waiters. */
+ void (*wakeup_all_recovery_waiters)(void *context);
+};
+
+struct dm_region {
+ struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
+ region_t key;
+ int state;
+
+ struct list_head hash_list;
+ struct list_head list;
+
+ atomic_t pending;
+ struct bio_list delayed_bios;
+};
+
/*
* States a region can have.
@@ -45,19 +99,6 @@ void dm_region_hash_destroy(struct dm_re
struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh);
/*
- * Conversion functions.
- */
-region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio);
-sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region);
-void *dm_rh_region_context(struct dm_region *reg);
-
-/*
- * Get region size and key (ie. number of the region).
- */
-sector_t dm_rh_get_region_size(struct dm_region_hash *rh);
-region_t dm_rh_get_region_key(struct dm_region *reg);
-
-/*
* Get/set/update region state (and dirty log).
*
*/
@@ -73,6 +114,7 @@ int dm_rh_flush(struct dm_region_hash *r
/* Inc/dec pending count on regions. */
void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios);
+void dm_rh_inc(struct dm_region_hash *rh, region_t region);
void dm_rh_dec(struct dm_region_hash *rh, region_t region);
/* Delay bios on regions. */
@@ -85,7 +127,7 @@ void dm_rh_mark_nosync(struct dm_region_
*/
/* Prepare some regions for recovery by starting to quiesce them. */
-void dm_rh_recovery_prepare(struct dm_region_hash *rh);
+int dm_rh_recovery_prepare(struct dm_region_hash *rh);
/* Try fetching a quiesced region for recovery. */
struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh);
@@ -100,4 +142,39 @@ int dm_rh_recovery_in_flight(struct dm_r
void dm_rh_start_recovery(struct dm_region_hash *rh);
void dm_rh_stop_recovery(struct dm_region_hash *rh);
+/*
+ * Conversion fns
+ */
+static inline region_t dm_rh_sector_to_region(struct dm_region_hash *rh,
+ sector_t sector)
+{
+ return sector >> rh->region_shift;
+}
+
+static inline sector_t dm_rh_region_to_sector(struct dm_region_hash *rh,
+ region_t region)
+{
+ return region << rh->region_shift;
+}
+
+static inline region_t dm_rh_bio_to_region(struct dm_region_hash *rh,
+ struct bio *bio)
+{
+ return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
+}
+
+static inline void *dm_rh_region_context(struct dm_region *reg)
+{
+ return reg->rh->context;
+}
+
+static inline region_t dm_rh_get_region_key(struct dm_region *reg)
+{
+ return reg->key;
+}
+
+static inline sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
+{
+ return rh->region_size;
+}
#endif /* DM_REGION_HASH_H */