qubes-linux-kernel/patches.xen/xen3-auto-blktap2-pvops.diff
2011-04-19 22:09:59 +02:00

2374 lines
53 KiB
Diff

Subject: pv-ops blktap2
From: https://git.kernel.org/?p=linux/kernel/git/jeremy/xen.git (commit 892d2f052e979cf1916647c752b94cf62ec1c6dc)
Patch-mainline: n/a
Acked-by: jbeulich@novell.com
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/Makefile 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+# blktap.o is a composite object linked from the parts listed here:
+blktap-objs := control.o ring.o device.o request.o sysfs.o
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/blktap.h 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,209 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...) \
+ do { \
+ if (blktap_debug_level > level && \
+ (force || printk_ratelimit())) \
+ printk(tag "%s: " _f, __func__, ##_a); \
+ } while (0)
+/* A message is printed when blktap_debug_level exceeds the first argument; a zero 'force' argument additionally applies printk_ratelimit(). */
+#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define MAX_BLKTAP_DEVICE 1024
+/* Bit numbers used with set_bit()/test_bit() on blktap.dev_inuse (BLKTAP_SHUTDOWN_REQUESTED presumably too -- it is not used in the code visible here). */
+#define BLKTAP_DEVICE 4
+#define BLKTAP_DEVICE_CLOSED 5
+#define BLKTAP_SHUTDOWN_REQUESTED 8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE 1
+#define BLKTAP2_IOCTL_ALLOC_TAP 200
+#define BLKTAP2_IOCTL_FREE_TAP 201
+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE 207
+
+#define BLKTAP2_MAX_MESSAGE_LEN 256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE 3
+
+#define BLKTAP_REQUEST_FREE 0
+#define BLKTAP_REQUEST_PENDING 1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM BLK_RING_SIZE
+#define MAX_PENDING_REQS BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req, _seg) /* address of page slot (_req, _seg) in the area at _start */ \
+	((_start) + \
+	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+	 ((_seg) * PAGE_SIZE))
+
+struct grant_handle_pair {
+ grant_handle_t kernel;
+ grant_handle_t user;
+};
+#define INVALID_GRANT_HANDLE 0xFFFF
+
+struct blktap_handle {
+ unsigned int ring; /* set to blktap_ring_major by BLKTAP2_IOCTL_ALLOC_TAP */
+ unsigned int device; /* set to blktap_device_major */
+ unsigned int minor; /* minor of the newly created tap */
+};
+
+struct blktap_params {
+ char name[BLKTAP2_MAX_MESSAGE_LEN]; /* copied into blktap.name when non-empty */
+ unsigned long long capacity; /* handed to set_capacity() */
+ unsigned long sector_size; /* logical block size; validated as a power of two, 512..4096 */
+};
+
+struct blktap_device {
+ spinlock_t lock; /* also passed to blk_init_queue() as the request queue lock */
+ struct gendisk *gd; /* NULL while no block device exists */
+};
+
+struct blktap_ring {
+ struct task_struct *task;
+ /* blktap_read_ring() does nothing while vma is NULL (presumably: ring not mmap()ed -- confirm in ring.c) */
+ struct vm_area_struct *vma;
+ struct blkif_front_ring ring;
+ unsigned long ring_vstart;
+ unsigned long user_vstart;
+
+ int n_pending;
+ struct blktap_request *pending[MAX_PENDING_REQS]; /* looked up by response id */
+
+ wait_queue_head_t poll_wait; /* blktap_device_destroy_sync() waits on this */
+
+ dev_t devno;
+ struct device *dev;
+};
+
+struct blktap_statistics {
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req; /* bumped each time blktap_device_make_request() gives up with -EBUSY */
+ int st_rd_sect;
+ int st_wr_sect;
+ s64 st_rd_cnt;
+ s64 st_rd_sum_usecs;
+ s64 st_rd_max_usecs;
+ s64 st_wr_cnt;
+ s64 st_wr_sum_usecs;
+ s64 st_wr_max_usecs;
+};
+
+struct blktap_request {
+ struct blktap *tap; /* owner, set by blktap_request_alloc() */
+ struct request *rq; /* block layer request being served */
+ int usr_idx;
+
+ int operation; /* BLKIF_OP_READ or BLKIF_OP_WRITE */
+ struct timeval time;
+
+ struct scatterlist sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* filled by blk_rq_map_sg() */
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* pool pages, first nr_pages valid */
+ int nr_pages;
+};
+
+#define blktap_for_each_sg(_sg, _req, _i) /* walk the first nr_pages entries of (_req)->sg_table */ \
+ for (_sg = (_req)->sg_table, _i = 0; \
+ _i < (_req)->nr_pages; \
+ (_sg)++, (_i)++)
+
+struct blktap {
+ int minor; /* index of this tap in blktaps[] */
+ unsigned long dev_inuse; /* bit flags, see BLKTAP_DEVICE and BLKTAP_DEVICE_CLOSED */
+
+ struct blktap_ring ring;
+ struct blktap_device device;
+ struct blktap_page_pool *pool; /* reference held via the pool's kobject */
+
+ wait_queue_head_t remove_wait;
+ struct work_struct remove_work;
+ char name[BLKTAP2_MAX_MESSAGE_LEN]; /* copied from blktap_params.name on device creation */
+
+ struct blktap_statistics stats;
+};
+
+struct blktap_page_pool {
+ struct mempool_s *bufs; /* mempool of pages */
+ spinlock_t lock; /* taken by blktap_request_get_pages() while it allocates */
+ struct kobject kobj; /* member of the "pools" kset; its release hook frees the pool */
+ wait_queue_head_t wait; /* woken by __page_pool_wake() */
+};
+
+extern struct mutex blktap_lock;
+extern struct blktap **blktaps;
+extern int blktap_max_minor;
+
+int blktap_control_destroy_tap(struct blktap *);
+size_t blktap_control_debug(struct blktap *, char *, size_t);
+
+int blktap_ring_init(void);
+void blktap_ring_exit(void);
+size_t blktap_ring_debug(struct blktap *, char *, size_t);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *,struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
+void blktap_ring_kick_user(struct blktap *);
+
+int blktap_sysfs_init(void);
+void blktap_sysfs_exit(void);
+int blktap_sysfs_create(struct blktap *);
+void blktap_sysfs_destroy(struct blktap *);
+
+int blktap_device_init(void);
+void blktap_device_exit(void);
+size_t blktap_device_debug(struct blktap *, char *, size_t);
+int blktap_device_create(struct blktap *, struct blktap_params *);
+int blktap_device_destroy(struct blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/control.c 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,315 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock); /* held while blktap_control_get_minor() scans and grows blktaps[] */
+
+struct blktap **blktaps; /* minor -> tap table, NULL for free slots */
+int blktap_max_minor; /* number of slots currently allocated in blktaps[] */
+static struct blktap_page_pool *default_pool; /* pool assigned to newly created taps */
+
+/*
+ * Allocate a zeroed tap, bind it to the lowest free minor (growing blktaps[]
+ * up to MAX_BLKTAP_DEVICE entries) and take a module ref. NULL on failure.
+ */
+static struct blktap *
+blktap_control_get_minor(void)
+{
+	int minor;
+	struct blktap *tap;
+
+	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+	if (unlikely(!tap))
+		return NULL;
+
+	mutex_lock(&blktap_lock);
+
+	for (minor = 0; minor < blktap_max_minor; minor++)
+		if (!blktaps[minor])
+			break;
+
+	if (minor == MAX_BLKTAP_DEVICE)
+		goto fail;
+
+	if (minor == blktap_max_minor) {
+		void *p;
+		int n;
+
+		n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
+		p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+		if (!p)
+			goto fail;
+
+		blktaps = p;
+		blktap_max_minor = n;
+		memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+	}
+
+	tap->minor = minor;
+	blktaps[minor] = tap;
+	__module_get(THIS_MODULE);
+out:
+	mutex_unlock(&blktap_lock);
+	return tap;
+
+fail:	/* blktap_lock is still held here; "out" performs the one unlock */
+	kfree(tap);
+	tap = NULL;
+	goto out;
+}
+
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+ blktaps[tap->minor] = NULL; /* vacate the slot before the tap is freed */
+ kfree(tap);
+ /* drop the module reference */
+ module_put(THIS_MODULE);
+}
+
+/* Create a tap: reserve a minor, pin the current default page pool, then
+ * set up the ring and the sysfs nodes. Every failure path releases what
+ * was acquired before it and returns NULL. */
+static struct blktap*
+blktap_control_create_tap(void)
+{
+	struct blktap *tap;
+
+	tap = blktap_control_get_minor();
+	if (!tap)
+		return NULL;
+
+	kobject_get(&default_pool->kobj);
+	tap->pool = default_pool;
+
+	if (blktap_ring_create(tap))
+		goto fail_tap;
+
+	if (blktap_sysfs_create(tap))
+		goto fail_ring;
+
+	return tap;
+
+fail_ring:
+	blktap_ring_destroy(tap);
+fail_tap:
+	kobject_put(&tap->pool->kobj);	/* undo the kobject_get() above */
+	blktap_control_put_minor(tap);
+	return NULL;
+}
+
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+ int err;
+ /* a failing ring teardown aborts the destroy and leaves the tap in place */
+ err = blktap_ring_destroy(tap);
+ if (err)
+ return err;
+ /* release the pool reference taken when the tap was created */
+ kobject_put(&tap->pool->kobj);
+
+ blktap_sysfs_destroy(tap);
+
+ blktap_control_put_minor(tap);
+
+ return 0;
+}
+
+/* Control node ioctls: ALLOC_TAP creates a tap and copies its handle to
+ * user space; FREE_TAP destroys the tap whose minor is passed in 'arg'. */
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct blktap *tap;
+
+	switch (cmd) {
+	case BLKTAP2_IOCTL_ALLOC_TAP: {
+		struct blktap_handle h;
+		void __user *ptr = (void __user*)arg;
+
+		tap = blktap_control_create_tap();
+		if (!tap)
+			return -ENOMEM;
+
+		h.ring = blktap_ring_major;
+		h.device = blktap_device_major;
+		h.minor = tap->minor;
+		if (copy_to_user(ptr, &h, sizeof(h))) {
+			blktap_control_destroy_tap(tap);
+			return -EFAULT;
+		}
+		return 0;
+	}
+
+	case BLKTAP2_IOCTL_FREE_TAP: {
+		int minor = arg;
+
+		/* blktaps[] holds blktap_max_minor slots, not MAX_BLKTAP_DEVICE */
+		if (minor < 0 || minor >= blktap_max_minor)
+			return -EINVAL;
+
+		tap = blktaps[minor];
+		if (!tap)
+			return -ENODEV;
+		return blktap_control_destroy_tap(tap);
+	}
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+ .owner = THIS_MODULE,
+ .ioctl = blktap_control_ioctl, /* ioctl is the only operation provided on the control node */
+};
+
+static struct miscdevice blktap_control = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "blktap-control",
+ .fops = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%s", kobject_name(&default_pool->kobj)); /* pool name only, no trailing newline */
+}
+
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct blktap_page_pool *pool, *tmp = default_pool;
+ /* look up, or create, the pool named by buf; buf is passed on unmodified */
+ pool = blktap_page_pool_get(buf);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ /* install the new default, then drop the reference on the previous one */
+ default_pool = pool;
+ kobject_put(&tmp->kobj);
+
+ return size;
+}
+
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH, /* mode 0644 */
+ blktap_control_show_default_pool,
+ blktap_control_store_default_pool);
+
+/* Write a one-line summary of @tap (device numbers, name, dev_inuse flags)
+ * into @buf, at most @size bytes including the terminator.
+ * Returns the number of characters stored, excluding the terminator. */
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+	/* scnprintf() never reports more than it stored, unlike snprintf() */
+	return scnprintf(buf, size,
+			 "tap %u:%u name:'%s' flags:%#08lx\n",
+			 MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+			 tap->name, tap->dev_inuse);
+}
+
+/* Register the control misc device, allocate the minor table and set up
+ * the page pools. On failure the caller unwinds through blktap_exit(). */
+static int __init
+blktap_control_init(void)
+{
+	struct blktap_page_pool *pool;
+	int err;
+
+	err = misc_register(&blktap_control);
+	if (err)
+		return err;
+
+	control_device = blktap_control.this_device;
+	blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
+	blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+	if (!blktaps) {
+		BTERR("failed to allocate blktap minor map\n");
+		return -ENOMEM;
+	}
+
+	err = blktap_page_pool_init(&control_device->kobj);
+	if (err)
+		return err;
+
+	/* blktap_page_pool_get() reports failure as ERR_PTR(), never NULL */
+	pool = blktap_page_pool_get("default");
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	default_pool = pool;
+
+	return device_create_file(control_device, &dev_attr_default_pool);
+}
+
+static void
+blktap_control_exit(void)
+{
+ if (default_pool) {
+ kobject_put(&default_pool->kobj);
+ default_pool = NULL;
+ }
+ /* each step below is guarded or self-guarding, so a partially completed init can be unwound too */
+ blktap_page_pool_exit();
+
+ if (blktaps) {
+ kfree(blktaps);
+ blktaps = NULL;
+ }
+ /* control_device is only set once misc_register() has succeeded */
+ if (control_device) {
+ misc_deregister(&blktap_control);
+ control_device = NULL;
+ }
+}
+
+static void
+blktap_exit(void) /* module exit; also called by blktap_init() to unwind a failed load */
+{
+ blktap_control_exit();
+ blktap_ring_exit();
+ blktap_sysfs_exit();
+ blktap_device_exit();
+}
+
+/*
+ * Module entry point. Brings the sub-systems up in the order device, ring,
+ * sysfs, control and stops at the first one that fails. Any failure is
+ * unwound with blktap_exit() before the error code is handed back.
+ */
+static int __init
+blktap_init(void)
+{
+	int rc = blktap_device_init();
+
+	if (!rc)
+		rc = blktap_ring_init();
+
+	if (!rc)
+		rc = blktap_sysfs_init();
+
+	if (!rc)
+		rc = blktap_control_init();
+
+	if (!rc)
+		return 0;
+
+	/* some stage failed: tear everything down again */
+	blktap_exit();
+
+	return rc;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/device.c 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,564 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include "blktap.h"
+
+int blktap_device_major; /* major returned by register_blkdev() in blktap_device_init() */
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device) /* embedded blktap_device -> owning blktap */
+
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct blktap_device *tapdev = disk->private_data;
+ /* private_data is cleared by blktap_device_destroy(); opens are refused from then on */
+ if (!tapdev)
+ return -ENXIO;
+
+ /* NB. we might have bounced a bd trylock by tapdisk. when
+ * failing for reasons not !tapdev, make sure to kick tapdisk
+ * out of destroy wait state again. */
+
+ return 0;
+}
+
+/* Last close flags the tap closed and kicks tapdisk; bdev is put after use. */
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+	struct blktap_device *tapdev = disk->private_data;
+	struct blktap *tap = dev_to_blktap(tapdev);
+	struct block_device *bdev = bdget_disk(disk, 0);
+
+	if (bdev && !bdev->bd_openers) {
+		set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
+		blktap_ring_kick_user(tap);
+	}
+	if (bdev)
+		bdput(bdev);
+	return 0;
+}
+
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+ /* We don't have real geometry info, but let's at least return
+ values consistent with the size of the device */
+ sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t cylinders = nsect;
+ /* fixed 255 heads x 63 sectors; cylinders = capacity / (heads * sectors) */
+ hg->heads = 0xff;
+ hg->sectors = 0x3f;
+ sector_div(cylinders, hg->heads * hg->sectors);
+ hg->cylinders = cylinders;
+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+ hg->cylinders = 0xffff; /* geometry cannot cover the capacity: report 0xffff cylinders */
+ return 0;
+}
+
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+ unsigned command, unsigned long argument)
+{
+ int i;
+ /* two ioctls are answered with zeroed data; everything else is -EINVAL */
+ switch (command) {
+ case CDROMMULTISESSION:
+ BTDBG("FIXME: support multisession CDs later\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++) /* zero the user's struct byte by byte */
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case SCSI_IOCTL_GET_IDLUN:
+ if (!access_ok(VERIFY_WRITE, argument,
+ sizeof(struct scsi_idlun)))
+ return -EFAULT;
+
+ /* return 0 for now. */
+ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
+ __put_user(0,
+ &((struct scsi_idlun __user *)argument)->host_unique_id);
+ return 0;
+
+ default:
+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+ command);*/
+ return -EINVAL; /* same return as native Linux */
+ }
+
+ return 0; /* not reached: every case above returns */
+}
+
+static struct block_device_operations blktap_device_file_operations = { /* installed as gd->fops in blktap_device_create() */
+ .owner = THIS_MODULE,
+ .open = blktap_device_open,
+ .release = blktap_device_release,
+ .ioctl = blktap_device_ioctl,
+ .getgeo = blktap_device_getgeo
+};
+
+/* NB. __blktap holding the queue lock; blktap where unlocked */
+
+static inline struct request*
+__blktap_next_queued_rq(struct request_queue *q)
+{
+ return blk_peek_request(q); /* peek only: the request is not started here */
+}
+
+static inline void
+__blktap_dequeue_rq(struct request *rq)
+{
+ blk_start_request(rq); /* start the request that was previously only peeked */
+}
+
+/* NB. err == 0 indicates success, failures < 0 */
+
+static inline void
+__blktap_end_queued_rq(struct request *rq, int err)
+{
+ blk_start_request(rq); /* a still-queued request is started first ... */
+ __blk_end_request(rq, err, blk_rq_bytes(rq)); /* ... then completed in full with status err */
+}
+
+static inline void
+__blktap_end_rq(struct request *rq, int err)
+{
+ __blk_end_request(rq, err, blk_rq_bytes(rq)); /* complete all bytes of an already started request */
+}
+
+static inline void
+blktap_end_rq(struct request *rq, int err)
+{
+ spin_lock_irq(rq->q->queue_lock); /* unlocked variant: takes the queue lock itself */
+ __blktap_end_rq(rq, err);
+ spin_unlock_irq(rq->q->queue_lock);
+}
+
+void
+blktap_device_end_request(struct blktap *tap,
+ struct blktap_request *request,
+ int error)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request *rq = request->rq; /* saved now: 'request' is released below */
+
+ blktap_ring_unmap_request(tap, request);
+
+ blktap_ring_free_request(tap, request);
+
+ dev_dbg(disk_to_dev(tapdev->gd),
+ "end_request: op=%d error=%d bytes=%d\n",
+ rq_data_dir(rq), error, blk_rq_bytes(rq));
+ /* finally complete the block layer request with the given status */
+ blktap_end_rq(rq, error);
+}
+
+int
+blktap_device_make_request(struct blktap *tap, struct request *rq)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct blktap_request *request;
+ int write, nsegs;
+ int err;
+ /* Returns 0 once submitted, -EBUSY when the ring request or its pages are unavailable right now, other negatives on hard failure. */
+ request = blktap_ring_make_request(tap);
+ if (IS_ERR(request)) {
+ err = PTR_ERR(request);
+ request = NULL;
+ /* -ENOSPC and -ENOMEM are treated as transient and mapped to -EBUSY */
+ if (err == -ENOSPC || err == -ENOMEM)
+ goto stop;
+
+ goto fail;
+ }
+
+ write = rq_data_dir(rq) == WRITE;
+ nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
+
+ dev_dbg(disk_to_dev(tapdev->gd),
+ "make_request: op=%c bytes=%d nsegs=%d\n",
+ write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
+
+ request->rq = rq;
+ request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
+ /* one pool page per scatterlist segment */
+ err = blktap_request_get_pages(tap, request, nsegs);
+ if (err)
+ goto stop;
+
+ err = blktap_ring_map_request(tap, request);
+ if (err)
+ goto fail;
+
+ blktap_ring_submit_request(tap, request);
+
+ return 0;
+
+stop: /* out of resources: count it and report -EBUSY */
+ tap->stats.st_oo_req++;
+ err = -EBUSY;
+
+_out: /* common exit for both error kinds: release the ring request if one was obtained */
+ if (request)
+ blktap_ring_free_request(tap, request);
+
+ return err;
+fail: /* hard failure: err is returned unchanged */
+ if (printk_ratelimit())
+ dev_warn(disk_to_dev(tapdev->gd),
+ "make request: %d, failing\n", err);
+ goto _out;
+}
+
+/*
+ * Feed queued block requests to the ring. Called from tapdisk context.
+ */
+void
+blktap_device_run_queue(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request_queue *q;
+ struct request *rq;
+ int err;
+
+ if (!tapdev->gd)
+ return;
+
+ q = tapdev->gd->queue;
+
+ spin_lock_irq(&tapdev->lock);
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+ do {
+ rq = __blktap_next_queued_rq(q);
+ if (!rq)
+ break;
+
+ if (!blk_fs_request(rq)) {
+ __blktap_end_queued_rq(rq, -EOPNOTSUPP);
+ continue;
+ }
+ /* the queue lock is dropped while the request is built and submitted */
+ spin_unlock_irq(&tapdev->lock);
+
+ err = blktap_device_make_request(tap, rq);
+
+ spin_lock_irq(&tapdev->lock);
+
+ if (err == -EBUSY) {
+ blk_stop_queue(q);
+ break;
+ }
+ /* rq has only been peeked so far; start it now */
+ __blktap_dequeue_rq(rq);
+
+ if (unlikely(err))
+ __blktap_end_rq(rq, err);
+ } while (1);
+
+ spin_unlock_irq(&tapdev->lock);
+}
+
+static void
+blktap_device_do_request(struct request_queue *rq)
+{
+ struct blktap_device *tapdev = rq->queuedata;
+ struct blktap *tap = dev_to_blktap(tapdev);
+ /* queue request_fn ('rq' is the queue): nothing is dequeued here, only blktap_ring_kick_user() is called */
+ blktap_ring_kick_user(tap);
+}
+
+/* Apply @params to the disk and program the request queue limits, all
+ * under the device lock. */
+static void
+blktap_device_configure(struct blktap *tap,
+			struct blktap_params *params)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct gendisk *disk = tapdev->gd;
+	struct request_queue *q = disk->queue;
+
+	spin_lock_irq(&tapdev->lock);
+
+	set_capacity(disk, params->capacity);
+
+	/* Mimic the equivalent hardware: sector size and request size cap. */
+	blk_queue_logical_block_size(q, params->sector_size);
+	blk_queue_max_sectors(q, 512);
+
+	/* A segment is at most one page and does not cross a page boundary. */
+	blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(q, PAGE_SIZE);
+
+	/* A merged request must still fit a single I/O ring slot. */
+	blk_queue_max_phys_segments(q, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_hw_segments(q, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	/* Buffer addresses have to be sector-aligned. */
+	blk_queue_dma_alignment(q, 511);
+
+	/* Requests get reordered, but there is no cache. */
+	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);
+
+	spin_unlock_irq(&tapdev->lock);
+}
+
+/* Validate user-supplied parameters: the sector size must be a power of two
+ * in [512, 4096], the capacity non-zero and at most ULLONG_MAX >> sector_order,
+ * and the name NUL-terminated within name_sz bytes. Returns 0 or -EINVAL. */
+static int
+blktap_device_validate_params(struct blktap *tap,
+			      struct blktap_params *params)
+{
+	struct device *dev = tap->ring.dev;
+	size_t name_sz = min(sizeof(params->name), sizeof(tap->name));	/* used by every 'goto fail' */
+	int sector_order = ffs(params->sector_size) - 1;
+
+	if (sector_order < 9 ||
+	    sector_order > 12 ||
+	    params->sector_size != 1U<<sector_order)
+		goto fail;
+
+	if (!params->capacity ||
+	    (params->capacity > ULLONG_MAX >> sector_order))
+		goto fail;
+
+	if (strnlen(params->name, name_sz) >= name_sz)
+		goto fail;
+
+	return 0;
+fail:
+	params->name[name_sz-1] = 0;
+	dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
+		params->capacity, params->sector_size, params->name);
+	return -EINVAL;
+}
+
+int
+blktap_device_destroy(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct block_device *bdev;
+ struct gendisk *gd;
+ int err;
+ /* Tear the disk down if nobody has it open. Returns 0 (also when no disk exists) or -EBUSY. */
+ gd = tapdev->gd;
+ if (!gd)
+ return 0;
+
+ bdev = bdget_disk(gd, 0);
+ /* NOTE(review): the bdget_disk() result is used without a NULL check -- confirm it cannot fail here */
+ err = !mutex_trylock(&bdev->bd_mutex);
+ if (err) {
+ /* NB. avoid a deadlock. the last opener syncs the
+ * bdev holding bd_mutex. */
+ err = -EBUSY;
+ goto out_nolock;
+ }
+
+ if (bdev->bd_openers) {
+ err = -EBUSY;
+ goto out;
+ }
+ /* no openers while bd_mutex is held: remove the disk and its queue */
+ del_gendisk(gd);
+ gd->private_data = NULL;
+
+ blk_cleanup_queue(gd->queue);
+
+ put_disk(gd);
+ tapdev->gd = NULL;
+
+ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+ err = 0;
+out:
+ mutex_unlock(&bdev->bd_mutex);
+out_nolock:
+ bdput(bdev);
+
+ return err;
+}
+
+static void
+blktap_device_fail_queue(struct blktap *tap)
+{
+ struct blktap_device *tapdev = &tap->device;
+ struct request_queue *q = tapdev->gd->queue;
+ /* Complete every request still on the queue with -EIO. */
+ spin_lock_irq(&tapdev->lock);
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+ do {
+ struct request *rq = __blktap_next_queued_rq(q);
+ if (!rq)
+ break;
+
+ __blktap_end_queued_rq(rq, -EIO);
+ } while (1);
+
+ spin_unlock_irq(&tapdev->lock);
+}
+
+static int
+blktap_device_try_destroy(struct blktap *tap)
+{
+ int err;
+ /* if the disk cannot be destroyed yet, fail whatever is queued on it */
+ err = blktap_device_destroy(tap);
+ if (err)
+ blktap_device_fail_queue(tap);
+
+ return err;
+}
+
+void
+blktap_device_destroy_sync(struct blktap *tap)
+{
+ wait_event(tap->ring.poll_wait,
+ !blktap_device_try_destroy(tap)); /* retried on each poll_wait wakeup until the destroy succeeds */
+}
+
+/*
+ * Create the gendisk and request queue backing @tap as described by
+ * @params and publish the disk. Returns 0, -EEXIST when the tap already
+ * has a device, -EINVAL on bad parameters, or -ENOMEM.
+ */
+int
+blktap_device_create(struct blktap *tap, struct blktap_params *params)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request_queue *rq = NULL;
+	struct gendisk *gd = NULL;
+	int minor = tap->minor, err;
+
+	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+		return -EEXIST;
+
+	if (blktap_device_validate_params(tap, params))
+		return -EINVAL;
+
+	gd = alloc_disk(1);
+	if (!gd) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (minor < 26) {
+		sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
+	} else if (minor < (26 + 1) * 26) {
+		sprintf(gd->disk_name, "td%c%c",
+			'a' + minor / 26 - 1,'a' + minor % 26);
+	} else {
+		const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
+		const unsigned int m2 = (minor / 26 - 1) % 26;
+		const unsigned int m3 = minor % 26;
+		sprintf(gd->disk_name, "td%c%c%c",
+			'a' + m1, 'a' + m2, 'a' + m3);
+	}
+
+	gd->major = blktap_device_major;
+	gd->first_minor = minor;
+	gd->fops = &blktap_device_file_operations;
+	gd->private_data = tapdev;
+
+	spin_lock_init(&tapdev->lock);
+	rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
+	if (!rq) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	elevator_init(rq, "noop");
+
+	gd->queue = rq;
+	rq->queuedata = tapdev;
+	tapdev->gd = gd;
+
+	blktap_device_configure(tap, params);
+	add_disk(gd);
+
+	if (params->name[0])
+		strncpy(tap->name, params->name, sizeof(tap->name)-1);
+
+	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+
+	dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
+		 queue_logical_block_size(rq),
+		 (unsigned long long)get_capacity(gd));
+
+	return 0;
+
+fail:
+	/* the disk was never add_disk()ed on these paths: release, don't delete */
+	if (rq)
+		blk_cleanup_queue(rq);
+	if (gd)
+		put_disk(gd);
+	return err;
+}
+
+/* Append disk, queue and bdev state of @tap to @buf; returns bytes stored. */
+size_t
+blktap_device_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct gendisk *disk = tap->device.gd;
+	struct request_queue *q;
+	struct block_device *bdev;
+	char *s = buf, *end = buf + size;
+
+	if (!disk)
+		return 0;
+
+	q = disk->queue;
+	s += scnprintf(s, end - s,	/* scnprintf: s can never run past end */
+		       "disk capacity:%llu sector size:%u\n",
+		       (unsigned long long)get_capacity(disk),
+		       queue_logical_block_size(q));
+
+	s += scnprintf(s, end - s,
+		       "queue flags:%#lx plugged:%d stopped:%d empty:%d\n",
+		       q->queue_flags,
+		       blk_queue_plugged(q), blk_queue_stopped(q),
+		       elv_queue_empty(q));
+
+	bdev = bdget_disk(disk, 0);
+	if (bdev) {
+		s += scnprintf(s, end - s,
+			       "bdev openers:%d closed:%d\n",
+			       bdev->bd_openers,
+			       test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
+		bdput(bdev);
+	}
+
+	return s - buf;
+}
+
+int __init
+blktap_device_init()
+{
+ int major;
+
+ /* Dynamically allocate a major for this device */
+ major = register_blkdev(0, "tapdev");
+ if (major < 0) {
+ BTERR("Couldn't register blktap device\n");
+ return -ENOMEM;
+ }
+
+ blktap_device_major = major;
+ BTINFO("blktap device major %d\n", major);
+
+ return 0;
+}
+
+void
+blktap_device_exit(void)
+{
+ if (blktap_device_major) /* still zero when registration never succeeded */
+ unregister_blkdev(blktap_device_major, "tapdev");
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/request.c 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,418 @@
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+
+#include "blktap.h"
+
+/* max pages per shared pool. just to prevent accidental dos. */
+#define POOL_MAX_PAGES (256*BLKIF_MAX_SEGMENTS_PER_REQUEST)
+
+/* default page pool size. when considering to shrink a shared pool,
+ * note that paused tapdisks may grab a whole lot of pages for a long
+ * time. */
+#define POOL_DEFAULT_PAGES (2 * MMAP_PAGES)
+
+/* max number of pages allocatable per request. */
+#define POOL_MAX_REQUEST_PAGES BLKIF_MAX_SEGMENTS_PER_REQUEST
+
+/* min request structs per pool. These grow dynamically. */
+#define POOL_MIN_REQS BLK_RING_SIZE
+
+static struct kset *pool_set; /* the "pools" kset created in blktap_page_pool_init() */
+
+#define kobj_to_pool(_kobj) \
+ container_of(_kobj, struct blktap_page_pool, kobj)
+
+static struct kmem_cache *request_cache; /* slab cache for struct blktap_request */
+static mempool_t *request_pool; /* mempool layered on request_cache */
+
+static void
+__page_pool_wake(struct blktap_page_pool *pool)
+{
+ mempool_t *mem = pool->bufs;
+
+ /*
+ NB. slightly wasteful to always wait for a full segment
+ set. but this ensures the next disk makes
+ progress. presently, the repeated request struct
+ alloc/release cycles would otherwise keep everyone spinning.
+ */
+ /* note: curr_nr is read here without taking pool->lock */
+ if (mem->curr_nr >= POOL_MAX_REQUEST_PAGES)
+ wake_up(&pool->wait);
+}
+
+int
+blktap_request_get_pages(struct blktap *tap,
+ struct blktap_request *request, int nr_pages)
+{
+ struct blktap_page_pool *pool = tap->pool;
+ mempool_t *mem = pool->bufs;
+ struct page *page;
+ /* All-or-nothing: attach nr_pages pool pages to the request, or fail with -ENOMEM. */
+ BUG_ON(request->nr_pages != 0);
+ BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES);
+ /* unlocked early-out; the same check is repeated under pool->lock below */
+ if (mem->curr_nr < nr_pages)
+ return -ENOMEM;
+
+ /* NB. avoid thundering herds of tapdisks colliding. */
+ spin_lock(&pool->lock);
+
+ if (mem->curr_nr < nr_pages) {
+ spin_unlock(&pool->lock);
+ return -ENOMEM;
+ }
+
+ while (request->nr_pages < nr_pages) {
+ page = mempool_alloc(mem, GFP_NOWAIT); /* __mempool_page_alloc() declines non-waiting requests, so the page comes from the pool's reserve */
+ BUG_ON(!page);
+ request->pages[request->nr_pages++] = page;
+ }
+
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+static void
+blktap_request_put_pages(struct blktap *tap,
+ struct blktap_request *request)
+{
+ struct blktap_page_pool *pool = tap->pool;
+ struct page *page;
+ /* hand every page back, last one first; nr_pages ends at 0 */
+ while (request->nr_pages) {
+ page = request->pages[--request->nr_pages];
+ mempool_free(page, pool->bufs);
+ }
+}
+
+/* Write the tap's page pool state (pool name, min_nr, curr_nr) into @buf,
+ * at most @size bytes; returns the number of characters stored. */
+size_t
+blktap_request_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct blktap_page_pool *pool = tap->pool;
+	mempool_t *mem = pool->bufs;
+
+	/* scnprintf() returns what was stored, so the result never exceeds size */
+	return scnprintf(buf, size,
+			 "pool:%s pages:%d free:%d\n",
+			 kobject_name(&pool->kobj),
+			 mem->min_nr, mem->curr_nr);
+}
+
+struct blktap_request*
+blktap_request_alloc(struct blktap *tap)
+{
+ struct blktap_request *request;
+ /* GFP_NOWAIT allocation from request_pool; the result may be NULL */
+ request = mempool_alloc(request_pool, GFP_NOWAIT);
+ if (request)
+ request->tap = tap;
+
+ return request;
+}
+
+void
+blktap_request_free(struct blktap *tap,
+ struct blktap_request *request)
+{
+ blktap_request_put_pages(tap, request);
+ /* pages first, then the request struct itself */
+ mempool_free(request, request_pool);
+ /* wakes pool waiters if enough pages are free again */
+ __page_pool_wake(tap->pool);
+}
+
+void
+blktap_request_bounce(struct blktap *tap,
+ struct blktap_request *request,
+ int seg, int write)
+{
+ struct scatterlist *sg = &request->sg_table[seg];
+ void *s, *p;
+ /* Copy segment 'seg' between the scatterlist buffer (s) and the request's pool page (p). */
+ BUG_ON(seg >= request->nr_pages);
+
+ s = sg_virt(sg);
+ p = page_address(request->pages[seg]) + sg->offset;
+ /* write: buffer -> pool page; otherwise pool page -> buffer */
+ if (write)
+ memcpy(p, s, sg->length);
+ else
+ memcpy(s, p, sg->length);
+}
+
+static void
+blktap_request_ctor(void *obj)
+{
+ struct blktap_request *request = obj;
+ /* slab constructor: objects start zeroed with an initialised sg table */
+ memset(request, 0, sizeof(*request));
+ sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table));
+}
+
+static int
+blktap_page_pool_resize(struct blktap_page_pool *pool, int target)
+{
+ mempool_t *bufs = pool->bufs;
+ int err;
+
+ /* NB. mempool asserts min_nr >= 1 */
+ target = max(1, target);
+
+ err = mempool_resize(bufs, target, GFP_KERNEL);
+ if (err)
+ return err;
+ /* wakes pool waiters if the resized pool has enough free pages */
+ __page_pool_wake(pool);
+
+ return 0;
+}
+
+struct pool_attribute {
+ struct attribute attr;
+ /* per-attribute hooks; either may be NULL */
+ ssize_t (*show)(struct blktap_page_pool *pool,
+ char *buf);
+
+ ssize_t (*store)(struct blktap_page_pool *pool,
+ const char *buf, size_t count);
+};
+/* generic sysfs attribute -> enclosing pool_attribute */
+#define kattr_to_pool_attr(_kattr) \
+ container_of(_kattr, struct pool_attribute, attr)
+
+static ssize_t
+blktap_page_pool_show_size(struct blktap_page_pool *pool,
+ char *buf)
+{
+ mempool_t *mem = pool->bufs;
+ return sprintf(buf, "%d", mem->min_nr); /* the mempool's min_nr, printed without a newline */
+}
+
+static ssize_t
+blktap_page_pool_store_size(struct blktap_page_pool *pool,
+ const char *buf, size_t size)
+{
+ int target;
+
+ /*
+ * NB. target fixup to avoid undesired results. less than a
+ * full segment set can wedge the disk. much more than a
+ * couple times the physical queue depth is rarely useful.
+ */
+ /* clamp the parsed value to [POOL_MAX_REQUEST_PAGES, POOL_MAX_PAGES] */
+ target = simple_strtoul(buf, NULL, 0);
+ target = max(POOL_MAX_REQUEST_PAGES, target);
+ target = min(target, POOL_MAX_PAGES);
+ /* the resize error if there is one, else the full write length */
+ return blktap_page_pool_resize(pool, target) ? : size;
+}
+
+static struct pool_attribute blktap_page_pool_attr_size =
+ __ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH, /* mode 0644 */
+ blktap_page_pool_show_size,
+ blktap_page_pool_store_size);
+
+static ssize_t
+blktap_page_pool_show_free(struct blktap_page_pool *pool,
+ char *buf)
+{
+ mempool_t *mem = pool->bufs;
+ return sprintf(buf, "%d", mem->curr_nr); /* the mempool's curr_nr, printed without a newline */
+}
+
+static struct pool_attribute blktap_page_pool_attr_free =
+ __ATTR(free, S_IRUSR|S_IRGRP|S_IROTH, /* mode 0444 */
+ blktap_page_pool_show_free,
+ NULL); /* no store hook */
+
+static struct attribute *blktap_page_pool_attrs[] = { /* default_attrs of blktap_page_pool_ktype */
+ &blktap_page_pool_attr_size.attr,
+ &blktap_page_pool_attr_free.attr,
+ NULL,
+};
+
+static inline struct kobject*
+__blktap_kset_find_obj(struct kset *kset, const char *name)
+{
+ struct kobject *k;
+ struct kobject *ret = NULL;
+ /* Find a kset member by name; a match is returned with a reference taken, otherwise NULL. */
+ spin_lock(&kset->list_lock);
+ list_for_each_entry(k, &kset->list, entry) {
+ if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
+ ret = kobject_get(k);
+ break;
+ }
+ }
+ spin_unlock(&kset->list_lock);
+ return ret;
+}
+
+static ssize_t
+blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
+ char *buf)
+{
+ struct blktap_page_pool *pool = kobj_to_pool(kobj);
+ struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+ /* dispatch to the attribute's show hook; -EIO when it has none */
+ if (attr->show)
+ return attr->show(pool, buf);
+
+ return -EIO;
+}
+
+/* sysfs ->store dispatcher: forward to the attribute's own store hook. */
+static ssize_t
+blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
+			    const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->store)	/* read-only attributes have no store hook */
+		return attr->store(pool, buf, size);
+	return -EIO;
+}
+
+static struct sysfs_ops blktap_page_pool_sysfs_ops = {
+ .show = blktap_page_pool_show_attr, /* both hooks dispatch through struct pool_attribute */
+ .store = blktap_page_pool_store_attr,
+};
+
+static void
+blktap_page_pool_release(struct kobject *kobj)
+{
+ struct blktap_page_pool *pool = kobj_to_pool(kobj); /* kobj is embedded in the pool */
+ mempool_destroy(pool->bufs); /* the page mempool goes first ... */
+ kfree(pool); /* ... then the pool structure itself */
+}
+
+struct kobj_type blktap_page_pool_ktype = { /* passed to kobject_init() for every pool */
+ .release = blktap_page_pool_release,
+ .sysfs_ops = &blktap_page_pool_sysfs_ops,
+ .default_attrs = blktap_page_pool_attrs,
+};
+
+static void*
+__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
+{
+ struct page *page;
+ /* fresh pages are only allocated for callers that may wait; others get NULL */
+ if (!(gfp_mask & __GFP_WAIT))
+ return NULL;
+
+ page = alloc_page(gfp_mask);
+ if (page)
+ SetPageReserved(page);
+ /* the reserved flag is cleared again in __mempool_page_free() */
+ return page;
+}
+
+static void
+__mempool_page_free(void *element, void *pool_data)
+{
+ struct page *page = element;
+ /* undo SetPageReserved() from the alloc hook, then drop the page */
+ ClearPageReserved(page);
+ put_page(page);
+}
+
+/*
+ * Allocate a page pool of @nr_pages pages and register it as kobject @name
+ * in the "pools" kset. Returns the new kobject, whose initial reference
+ * belongs to the caller, or NULL on failure.
+ */
+static struct kobject*
+blktap_page_pool_create(const char *name, int nr_pages)
+{
+	struct blktap_page_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	spin_lock_init(&pool->lock);
+	init_waitqueue_head(&pool->wait);
+
+	pool->bufs = mempool_create(nr_pages,
+				    __mempool_page_alloc, __mempool_page_free,
+				    pool);
+	if (!pool->bufs) {
+		kfree(pool);
+		return NULL;
+	}
+
+	kobject_init(&pool->kobj, &blktap_page_pool_ktype);
+	pool->kobj.kset = pool_set;
+	if (kobject_add(&pool->kobj, &pool_set->kobj, "%s", name)) {
+		/* an initialised kobject must be disposed of with kobject_put();
+		 * the ->release hook then frees the mempool and the pool */
+		kobject_put(&pool->kobj);
+		return NULL;
+	}
+	return &pool->kobj;
+}
+
+struct blktap_page_pool*
+blktap_page_pool_get(const char *name)
+{
+ struct kobject *kobj;
+ /* Look the pool up by name, creating it on a miss. Fails with ERR_PTR(-ENOMEM), never NULL. */
+ kobj = __blktap_kset_find_obj(pool_set, name);
+ if (!kobj)
+ kobj = blktap_page_pool_create(name,
+ POOL_DEFAULT_PAGES);
+ if (!kobj)
+ return ERR_PTR(-ENOMEM);
+
+ return kobj_to_pool(kobj);
+}
+
+int __init
+blktap_page_pool_init(struct kobject *parent)
+{
+ request_cache =
+ kmem_cache_create("blktap-request",
+ sizeof(struct blktap_request), 0,
+ 0, blktap_request_ctor);
+ if (!request_cache)
+ return -ENOMEM;
+ /* nothing is undone on failure here; blktap_page_pool_exit() copes with partial setup */
+ request_pool =
+ mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
+ if (!request_pool)
+ return -ENOMEM;
+ /* "pools" kset below the parent kobject */
+ pool_set = kset_create_and_add("pools", NULL, parent);
+ if (!pool_set)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void
+blktap_page_pool_exit(void)
+{
+ if (pool_set) {
+ BUG_ON(!list_empty(&pool_set->list)); /* every pool must be gone by now */
+ kset_unregister(pool_set);
+ pool_set = NULL;
+ }
+ /* the mempool is destroyed before the slab cache backing it */
+ if (request_pool) {
+ mempool_destroy(request_pool);
+ request_pool = NULL;
+ }
+
+ if (request_cache) {
+ kmem_cache_destroy(request_cache);
+ request_cache = NULL;
+ }
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/ring.c 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,550 @@
+
+#include <linux/device.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_ring_major;
+static struct cdev blktap_ring_cdev;
+
+ /*
+ * BLKTAP - immediately before the mmap area,
+ * we have a bunch of pages reserved for shared memory rings.
+ */
+#define RING_PAGES 1
+
+/*
+ * Complete the pending request named by one ring response; an invalid
+ * response is logged, and its request (if one was found) ended with err.
+ */
+static void
+blktap_ring_read_response(struct blktap *tap,
+			  const struct blkif_response *rsp)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request = NULL;
+	int usr_idx, err;
+
+	usr_idx = rsp->id;
+	if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
+		err = -ERANGE;
+		goto invalid;
+	}
+
+	request = ring->pending[usr_idx];
+	if (!request) {
+		err = -ESRCH;
+		goto invalid;
+	}
+
+	if (rsp->operation != request->operation) {
+		err = -EINVAL;
+		goto invalid;
+	}
+
+	dev_dbg(ring->dev, "request %d [%p] response: %d\n",
+		request->usr_idx, request, rsp->status);
+
+	err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+end_request:
+	blktap_device_end_request(tap, request, err);
+	return;
+
+invalid:
+	/* request is NULL for -ERANGE/-ESRCH: do not dereference it here */
+	dev_warn(ring->dev,
+		 "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
+		 usr_idx, rsp->status, rsp->operation,
+		 request ? request->operation : -1, err);
+	if (request)
+		goto end_request;
+}
+
+static void
+blktap_read_ring(struct blktap *tap) /* consume every response currently published on the ring */
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_response rsp;
+ RING_IDX rc, rp;
+
+ down_read(&current->mm->mmap_sem);
+ if (!ring->vma) { /* ring is not mapped: nothing to read */
+ up_read(&current->mm->mmap_sem);
+ return;
+ }
+
+ /* for each outstanding message on the ring */
+ rp = ring->ring.sring->rsp_prod;
+ rmb(); /* read rsp_prod before the responses it covers */
+
+ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
+ memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp)); /* handle a private copy, not the slot in the ring page */
+ blktap_ring_read_response(tap, &rsp);
+ }
+
+ ring->ring.rsp_cons = rc; /* everything up to rp has been consumed */
+
+ up_read(&current->mm->mmap_sem);
+}
+
+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS; /* no page is ever supplied from the fault handler */
+}
+
+static void
+blktap_ring_fail_pending(struct blktap *tap) /* end every request still in ring->pending[] with -EIO */
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blktap_request *request;
+ int usr_idx;
+
+ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+ request = ring->pending[usr_idx];
+ if (!request) /* unused slot */
+ continue;
+
+ blktap_device_end_request(tap, request, -EIO);
+ }
+}
+
+static void
+blktap_ring_vm_close(struct vm_area_struct *vma) /* .close of the ring vma: fail pending requests and free the ring page */
+{
+ struct blktap *tap = vma->vm_private_data;
+ struct blktap_ring *ring = &tap->ring;
+ struct page *page = virt_to_page(ring->ring.sring);
+
+ blktap_ring_fail_pending(tap);
+
+ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL); /* the ring page sits at the start of the vma */
+ ClearPageReserved(page); /* undoes SetPageReserved() done in blktap_ring_mmap() */
+ __free_page(page);
+
+ ring->vma = NULL; /* marks the ring as unmapped */
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) /* a shutdown was requested earlier: try to destroy the tap now */
+ blktap_control_destroy_tap(tap);
+}
+
+static struct vm_operations_struct blktap_ring_vm_operations = { /* installed on the vma by blktap_ring_mmap() */
+ .close = blktap_ring_vm_close,
+ .fault = blktap_ring_fault,
+};
+
+int
+blktap_ring_map_segment(struct blktap *tap, /* insert page @seg of @request into the ring vma; returns vm_insert_page()'s result */
+ struct blktap_request *request,
+ int seg)
+{
+ struct blktap_ring *ring = &tap->ring;
+ unsigned long uaddr;
+
+ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); /* user address derived from the request slot and segment */
+ return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
+}
+
+int
+blktap_ring_map_request(struct blktap *tap, /* map all pages of @request into the ring vma; 0 or the first mapping error */
+ struct blktap_request *request)
+{
+ int seg, err = 0;
+ int write;
+
+ write = request->operation == BLKIF_OP_WRITE;
+
+ for (seg = 0; seg < request->nr_pages; seg++) {
+ if (write) /* writes are bounced before the page is mapped */
+ blktap_request_bounce(tap, request, seg, write);
+
+ err = blktap_ring_map_segment(tap, request, seg);
+ if (err)
+ break;
+ }
+
+ if (err) /* undo the whole request range on a partial failure */
+ blktap_ring_unmap_request(tap, request);
+
+ return err;
+}
+
+void
+blktap_ring_unmap_request(struct blktap *tap, /* bounce read data back, then zap the request's range in the ring vma */
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+ unsigned long uaddr;
+ unsigned size;
+ int seg, read;
+
+ uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0); /* address of the request's first segment */
+ size = request->nr_pages << PAGE_SHIFT;
+ read = request->operation == BLKIF_OP_READ;
+
+ if (read) /* reads are bounced with the write argument set to 0 (!read) */
+ for (seg = 0; seg < request->nr_pages; seg++)
+ blktap_request_bounce(tap, request, seg, !read);
+
+ zap_page_range(ring->vma, uaddr, size, NULL);
+}
+
+void
+blktap_ring_free_request(struct blktap *tap, /* drop @request from the pending table and free it */
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ ring->pending[request->usr_idx] = NULL; /* the slot becomes reusable by blktap_ring_make_request() */
+ ring->n_pending--;
+
+ blktap_request_free(tap, request);
+}
+
+struct blktap_request*
+blktap_ring_make_request(struct blktap *tap) /* allocate a request and bind it to a free pending[] slot; ERR_PTR on failure */
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blktap_request *request;
+ int usr_idx;
+
+ if (RING_FULL(&ring->ring)) /* no free request slot on the ring */
+ return ERR_PTR(-ENOSPC);
+
+ request = blktap_request_alloc(tap);
+ if (!request)
+ return ERR_PTR(-ENOMEM);
+
+ for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++) /* NOTE(review): other loops over pending[] are bounded by MAX_PENDING_REQS - confirm the two are equal */
+ if (!ring->pending[usr_idx])
+ break;
+
+ BUG_ON(usr_idx >= BLK_RING_SIZE); /* a free slot is expected once RING_FULL() was false */
+
+ request->tap = tap;
+ request->usr_idx = usr_idx;
+
+ ring->pending[usr_idx] = request;
+ ring->n_pending++;
+
+ return request;
+}
+
+void
+blktap_ring_submit_request(struct blktap *tap, /* fill the next ring slot from @request and account it in tap->stats */
+ struct blktap_request *request)
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_request *breq;
+ struct scatterlist *sg;
+ int i, nsecs = 0;
+
+ dev_dbg(ring->dev,
+ "request %d [%p] submit\n", request->usr_idx, request);
+
+ breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+
+ breq->id = request->usr_idx; /* the response is matched back through this id */
+ breq->sector_number = blk_rq_pos(request->rq);
+ breq->handle = 0;
+ breq->operation = request->operation;
+ breq->nr_segments = request->nr_pages;
+
+ blktap_for_each_sg(sg, request, i) {
+ struct blkif_request_segment *seg = &breq->seg[i];
+ int first, count;
+
+ count = sg->length >> 9; /* bytes to 512-byte sectors */
+ first = sg->offset >> 9;
+
+ seg->first_sect = first;
+ seg->last_sect = first + count - 1; /* last_sect is inclusive */
+
+ nsecs += count;
+ }
+
+ ring->ring.req_prod_pvt++; /* private producer index only; blktap_ring_poll() does RING_PUSH_REQUESTS() */
+
+ do_gettimeofday(&request->time); /* submit time, printed by blktap_ring_debug() */
+
+
+ if (request->operation == BLKIF_OP_WRITE) {
+ tap->stats.st_wr_sect += nsecs;
+ tap->stats.st_wr_req++;
+ }
+
+ if (request->operation == BLKIF_OP_READ) {
+ tap->stats.st_rd_sect += nsecs;
+ tap->stats.st_rd_req++;
+ }
+}
+
+static int
+blktap_ring_open(struct inode *inode, struct file *filp) /* bind the file to the tap for this minor and record the opening task */
+{
+ struct blktap *tap = NULL;
+ int minor;
+
+ minor = iminor(inode);
+
+ if (minor < blktap_max_minor)
+ tap = blktaps[minor];
+
+ if (!tap) /* minor out of range or no tap allocated for it */
+ return -ENXIO;
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) /* tap is being shut down */
+ return -ENXIO;
+
+ if (tap->ring.task) /* already opened by some task */
+ return -EBUSY;
+
+ filp->private_data = tap;
+ tap->ring.task = current;
+
+ return 0;
+}
+
+static int
+blktap_ring_release(struct inode *inode, struct file *filp) /* last close: destroy the block device and give up ring ownership */
+{
+ struct blktap *tap = filp->private_data;
+
+ blktap_device_destroy_sync(tap);
+
+ tap->ring.task = NULL; /* allows blktap_ring_open() to succeed again */
+
+ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) /* a shutdown was requested earlier: try to destroy the tap now */
+ blktap_control_destroy_tap(tap);
+
+ return 0;
+}
+
+static int
+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) /* allocate the shared ring page, map it at vma->vm_start and initialise the ring */
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+ struct blkif_sring *sring;
+ struct page *page = NULL;
+ int err;
+
+ if (ring->vma) /* only one mapping of the ring at a time */
+ return -EBUSY;
+
+ page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ SetPageReserved(page);
+
+ err = vm_insert_page(vma, vma->vm_start, page);
+ if (err)
+ goto fail;
+
+ sring = page_address(page);
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
+ ring->ring_vstart = vma->vm_start;
+ ring->user_vstart = ring->ring_vstart + PAGE_SIZE; /* request data pages are mapped after the single ring page */
+
+ vma->vm_private_data = tap; /* read back by blktap_ring_vm_close() */
+
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_RESERVED;
+
+ vma->vm_ops = &blktap_ring_vm_operations;
+
+ ring->vma = vma;
+ return 0;
+
+fail:
+ if (page) { /* page is always non-NULL when this label is reached */
+ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+ ClearPageReserved(page);
+ __free_page(page);
+ }
+
+ return err;
+}
+
+static int
+blktap_ring_ioctl(struct inode *inode, struct file *filp, /* ring device ioctls: kick, create device, remove device */
+ unsigned int cmd, unsigned long arg)
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+
+ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
+ if (!ring->vma || ring->vma->vm_mm != current->mm) /* caller must be the mm that holds the ring mapping */
+ return -EACCES;
+
+ switch(cmd) {
+ case BLKTAP2_IOCTL_KICK_FE: /* process the responses now on the ring */
+
+ blktap_read_ring(tap);
+ return 0;
+
+ case BLKTAP2_IOCTL_CREATE_DEVICE: { /* arg points to a struct blktap_params in user memory */
+ struct blktap_params params;
+ void __user *ptr = (void *)arg;
+
+ if (!arg)
+ return -EINVAL;
+
+ if (copy_from_user(&params, ptr, sizeof(params)))
+ return -EFAULT;
+
+ return blktap_device_create(tap, &params);
+ }
+
+ case BLKTAP2_IOCTL_REMOVE_DEVICE:
+
+ return blktap_device_destroy(tap);
+ }
+
+ return -ENOIOCTLCMD; /* any other command */
+}
+
+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) /* run the device queue, push new requests, report readability */
+{
+ struct blktap *tap = filp->private_data;
+ struct blktap_ring *ring = &tap->ring;
+ int work;
+
+ poll_wait(filp, &tap->pool->wait, wait);
+ poll_wait(filp, &ring->poll_wait, wait);
+
+ down_read(&current->mm->mmap_sem);
+ if (ring->vma && tap->device.gd) /* only with the ring mapped and a disk present */
+ blktap_device_run_queue(tap);
+ up_read(&current->mm->mmap_sem);
+
+ work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod; /* NOTE(review): sring is dereferenced without the ring->vma check used above - confirm poll cannot run while the ring is unmapped */
+ RING_PUSH_REQUESTS(&ring->ring);
+
+ if (work || /* readable on new requests, a pending message, or a device-closed event */
+ ring->ring.sring->private.tapif_user.msg ||
+ test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static struct file_operations blktap_ring_file_operations = { /* registered for the ring char devices in blktap_ring_init() */
+ .owner = THIS_MODULE,
+ .open = blktap_ring_open,
+ .release = blktap_ring_release,
+ .ioctl = blktap_ring_ioctl,
+ .mmap = blktap_ring_mmap,
+ .poll = blktap_ring_poll,
+};
+
+void
+blktap_ring_kick_user(struct blktap *tap) /* wake tasks waiting in blktap_ring_poll() */
+{
+ wake_up(&tap->ring.poll_wait);
+}
+
+int
+blktap_ring_destroy(struct blktap *tap) /* reports whether the ring is idle; frees nothing itself */
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ if (ring->task || ring->vma) /* still open or still mapped */
+ return -EBUSY;
+
+ return 0;
+}
+
+int
+blktap_ring_create(struct blktap *tap) /* initialise the poll waitqueue and device number; always returns 0 */
+{
+ struct blktap_ring *ring = &tap->ring;
+
+ init_waitqueue_head(&ring->poll_wait);
+ ring->devno = MKDEV(blktap_ring_major, tap->minor); /* later passed to device_create() by blktap_sysfs_create() */
+
+ return 0;
+}
+
+/*
+ * Write a listing of the ring's pending requests into @buf (at most
+ * @size bytes) and return the number of bytes written.
+ */
+size_t
+blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct blktap_ring *ring = &tap->ring;
+	char *s = buf, *end = buf + size;
+	int usr_idx;
+
+	/* scnprintf() returns what was stored, so s can never pass end */
+	s += scnprintf(s, end - s, "begin pending:%d\n", ring->n_pending);
+
+	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+		struct blktap_request *request = ring->pending[usr_idx];
+		int write;
+
+		if (!request)
+			continue;
+
+		write = request->operation == BLKIF_OP_WRITE;
+		/* tv_usec is microseconds: pad to six digits, not nine */
+		s += scnprintf(s, end - s,
+			       "%02d: usr_idx:%02d "
+			       "op:%c nr_pages:%02d time:%lu.%06lu\n",
+			       usr_idx, request->usr_idx,
+			       write ? 'W' : 'R', request->nr_pages,
+			       request->time.tv_sec, request->time.tv_usec);
+	}
+
+	s += scnprintf(s, end - s, "end pending\n");
+	return s - buf;
+}
+
+
+int __init
+blktap_ring_init(void) /* reserve MAX_BLKTAP_DEVICE char minors and register the ring cdev on them */
+{
+ dev_t dev = 0;
+ int err;
+
+ cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
+ blktap_ring_cdev.owner = THIS_MODULE;
+
+ err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2"); /* the major number is assigned dynamically */
+ if (err < 0) {
+ BTERR("error registering ring devices: %d\n", err);
+ return err;
+ }
+
+ err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
+ if (err) {
+ BTERR("error adding ring device: %d\n", err);
+ unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE); /* give the region back on failure */
+ return err;
+ }
+
+ blktap_ring_major = MAJOR(dev); /* non-zero major tells blktap_ring_exit() there is something to undo */
+ BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+ return 0;
+}
+
+void
+blktap_ring_exit(void) /* undo blktap_ring_init(); a no-op when no major was registered */
+{
+ if (!blktap_ring_major)
+ return;
+
+ cdev_del(&blktap_ring_cdev);
+ unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
+ MAX_BLKTAP_DEVICE);
+
+ blktap_ring_major = 0; /* makes a second call harmless */
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/sysfs.c 2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,288 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) /* store handler for "name": copy @buf into tap->name */
+{
+ struct blktap *tap;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap) /* drvdata is cleared by blktap_sysfs_destroy() */
+ return 0;
+
+ if (size >= BLKTAP2_MAX_MESSAGE_LEN)
+ return -ENAMETOOLONG;
+
+ if (strnlen(buf, size) != size) /* reject input with a NUL inside the first @size bytes */
+ return -EINVAL;
+
+ strcpy(tap->name, buf); /* NOTE(review): relies on buf being NUL-terminated at buf[size] and tap->name holding BLKTAP2_MAX_MESSAGE_LEN bytes - confirm */
+
+ return size;
+}
+
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf) /* show handler for "name" */
+{
+ struct blktap *tap;
+ ssize_t size;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap) /* drvdata is cleared by blktap_sysfs_destroy() */
+ return 0;
+
+ if (tap->name[0])
+ size = sprintf(buf, "%s\n", tap->name);
+ else /* no name set: fall back to the minor number */
+ size = sprintf(buf, "%d\n", tap->minor);
+
+ return size;
+}
+static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
+ blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+static void
+blktap_sysfs_remove_work(struct work_struct *work) /* work item queued by blktap_sysfs_remove_device(): destroy the owning tap */
+{
+ struct blktap *tap
+ = container_of(work, struct blktap, remove_work);
+ blktap_control_destroy_tap(tap);
+}
+
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev, /* store handler for "remove": request shutdown, then wait for drvdata to be cleared */
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct blktap *tap;
+ int err;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap) /* drvdata already cleared: nothing left to remove */
+ return size;
+
+ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) /* shutdown already requested: just wait */
+ goto wait;
+
+ if (tap->ring.vma) { /* ring is mapped: post a CLOSE message and wake the poller */
+ struct blkif_sring *sring = tap->ring.ring.sring;
+ sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
+ blktap_ring_kick_user(tap);
+ } else { /* ring is not mapped: destroy the tap from a work item */
+ INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
+ schedule_work(&tap->remove_work);
+ }
+wait:
+ err = wait_event_interruptible(tap->remove_wait, /* woken by blktap_sysfs_destroy() after it clears drvdata */
+ !dev_get_drvdata(dev));
+ if (err)
+ return err;
+
+ return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf) /* show handler for "debug": concatenate the per-subsystem debug dumps */
+{
+ struct blktap *tap;
+ char *s = buf, *end = buf + PAGE_SIZE; /* output is limited to one page */
+
+ tap = dev_get_drvdata(dev);
+ if (!tap) /* drvdata is cleared by blktap_sysfs_destroy() */
+ return 0;
+
+ s += blktap_control_debug(tap, s, end - s); /* each helper gets the space that is left */
+
+ s += blktap_request_debug(tap, s, end - s);
+
+ s += blktap_device_debug(tap, s, end - s);
+
+ s += blktap_ring_debug(tap, s, end - s);
+
+ return s - buf;
+}
+static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
+
+static ssize_t
+blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf) /* show handler for "task": pid of the task recorded in ring.task */
+{
+ struct blktap *tap;
+ ssize_t rv = 0;
+
+ tap = dev_get_drvdata(dev);
+ if (!tap) /* drvdata is cleared by blktap_sysfs_destroy() */
+ return 0;
+
+ if (tap->ring.task) /* output stays empty while no task is recorded */
+ rv = sprintf(buf, "%d\n", tap->ring.task->pid);
+
+ return rv;
+}
+static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
+
+/* Show the tap's page pool name; empty when the device has no drvdata. */
+static ssize_t
+blktap_sysfs_show_pool(struct device *dev,
+		       struct device_attribute *attr, char *buf)
+{
+	struct blktap *tap = dev_get_drvdata(dev);
+	return tap ? sprintf(buf, "%s", kobject_name(&tap->pool->kobj)) : 0;
+}
+
+/* Switch the tap to the page pool named in @buf, creating it if needed. */
+static ssize_t
+blktap_sysfs_store_pool(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	struct blktap *tap = dev_get_drvdata(dev);
+	struct blktap_page_pool *pool, *old;
+
+	if (!tap)	/* drvdata is cleared by blktap_sysfs_destroy() */
+		return -ENODEV;
+	if (tap->device.gd)	/* no pool switch while a disk exists */
+		return -EBUSY;
+	pool = blktap_page_pool_get(buf);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+	old = tap->pool;
+	tap->pool = pool;
+	kobject_put(&old->kobj);	/* drop the reference on the old pool */
+	return size;
+}
+DEVICE_ATTR(pool, S_IRUSR|S_IWUSR,
+	    blktap_sysfs_show_pool, blktap_sysfs_store_pool);
+
+/* Create the blktap%d class device for @tap plus its attribute files. */
+int
+blktap_sysfs_create(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct device *dev;
+	int err;
+
+	init_waitqueue_head(&tap->remove_wait);
+	dev = device_create(class, NULL, ring->devno,
+			    tap, "blktap%d", tap->minor);
+	if (IS_ERR(dev))	/* nothing to unregister: dev is an ERR_PTR */
+		return PTR_ERR(dev);
+
+	err = device_create_file(dev, &dev_attr_name);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_remove);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_debug);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_task);
+	if (!err)
+		err = device_create_file(dev, &dev_attr_pool);
+	if (!err)
+		ring->dev = dev;
+	else
+		device_unregister(dev);
+
+	return err;
+}
+
+void
+blktap_sysfs_destroy(struct blktap *tap) /* detach the tap from its class device and unregister the device */
+{
+ struct blktap_ring *ring = &tap->ring;
+ struct device *dev;
+
+ dev = ring->dev;
+
+ if (!dev) /* no device was created, or it is already gone */
+ return;
+
+ dev_set_drvdata(dev, NULL); /* attribute handlers treat NULL drvdata as "tap gone" */
+ wake_up(&tap->remove_wait); /* releases waiters in blktap_sysfs_remove_device() */
+
+ device_unregister(dev);
+ ring->dev = NULL;
+}
+
+static ssize_t
+blktap_sysfs_show_verbosity(struct class *class, char *buf) /* show handler for class attribute "verbosity" */
+{
+ return sprintf(buf, "%d\n", blktap_debug_level);
+}
+
+static ssize_t
+blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) /* store handler for "verbosity": parse a decimal level */
+{
+ int level;
+
+ if (sscanf(buf, "%d", &level) == 1) { /* any int is accepted; no range check */
+ blktap_debug_level = level;
+ return size;
+ }
+
+ return -EINVAL; /* input did not start with an integer */
+}
+static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
+ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
+/*
+ * List "<minor> <name>" per BLKTAP_DEVICE tap, bounded to PAGE_SIZE.
+ */
+static ssize_t
+blktap_sysfs_show_devices(struct class *class, char *buf)
+{
+	int i, ret = 0;
+	struct blktap *tap;
+
+	mutex_lock(&blktap_lock);
+
+	for (i = 0; i < blktap_max_minor; i++) {
+		tap = blktaps[i];
+		if (!tap || !test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+			continue;
+
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret,
+				 "%d %s\n", tap->minor, tap->name);
+	}
+
+	mutex_unlock(&blktap_lock);
+
+	return ret;
+}
+static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
+
+void
+blktap_sysfs_exit(void) /* destroy the blktap2 class if blktap_sysfs_init() created it */
+{
+ if (class)
+ class_destroy(class);
+}
+
+int __init
+blktap_sysfs_init(void) /* create the "blktap2" class with its verbosity and devices attributes */
+{
+ struct class *cls;
+ int err = 0;
+
+ cls = class_create(THIS_MODULE, "blktap2");
+ if (IS_ERR(cls))
+ err = PTR_ERR(cls);
+ if (!err)
+ err = class_create_file(cls, &class_attr_verbosity);
+ if (!err)
+ err = class_create_file(cls, &class_attr_devices);
+ if (!err)
+ class = cls; /* published only once fully set up */
+ else
+ class_destroy(cls); /* NOTE(review): cls is an ERR_PTR when class_create() failed - confirm class_destroy() tolerates that */
+
+ return err;
+}