Subject: pv-ops blktap2
From: https://git.kernel.org/?p=linux/kernel/git/jeremy/xen.git (commit 892d2f052e979cf1916647c752b94cf62ec1c6dc)
Patch-mainline: n/a
Acked-by: jbeulich@novell.com

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/Makefile	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-objs := control.o ring.o device.o request.o sysfs.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/blktap.h	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,209 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...)				\
+	do {								\
+		if (blktap_debug_level > level &&			\
+		    (force || printk_ratelimit()))			\
+			printk(tag "%s: " _f, __func__, ##_a);		\
+	} while (0)
+
+#define BTDBG(_f, _a...)  BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...)  BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define MAX_BLKTAP_DEVICE 1024
+
+#define BLKTAP_DEVICE 4
+#define BLKTAP_DEVICE_CLOSED 5
+#define BLKTAP_SHUTDOWN_REQUESTED 8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE 1
+#define BLKTAP2_IOCTL_ALLOC_TAP 200
+#define BLKTAP2_IOCTL_FREE_TAP 201
+#define BLKTAP2_IOCTL_CREATE_DEVICE 202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE 207
+
+#define BLKTAP2_MAX_MESSAGE_LEN 256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE 3
+
+#define BLKTAP_REQUEST_FREE 0
+#define BLKTAP_REQUEST_PENDING 1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */ +#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req, _seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +struct grant_handle_pair { + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +struct blktap_handle { + unsigned int ring; + unsigned int device; + unsigned int minor; +}; + +struct blktap_params { + char name[BLKTAP2_MAX_MESSAGE_LEN]; + unsigned long long capacity; + unsigned long sector_size; +}; + +struct blktap_device { + spinlock_t lock; + struct gendisk *gd; +}; + +struct blktap_ring { + struct task_struct *task; + + struct vm_area_struct *vma; + struct blkif_front_ring ring; + unsigned long ring_vstart; + unsigned long user_vstart; + + int n_pending; + struct blktap_request *pending[MAX_PENDING_REQS]; + + wait_queue_head_t poll_wait; + + dev_t devno; + struct device *dev; +}; + +struct blktap_statistics { + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + s64 st_rd_cnt; + s64 st_rd_sum_usecs; + s64 st_rd_max_usecs; + s64 st_wr_cnt; + s64 st_wr_sum_usecs; + s64 st_wr_max_usecs; +}; + +struct blktap_request { + struct blktap *tap; + struct request *rq; + int usr_idx; + + int operation; + struct timeval time; + + struct scatterlist sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int nr_pages; +}; + +#define blktap_for_each_sg(_sg, _req, _i) \ + for (_sg = (_req)->sg_table, _i = 0; \ + _i < (_req)->nr_pages; \ + (_sg)++, (_i)++) + +struct blktap { + int minor; + unsigned long dev_inuse; + + struct blktap_ring ring; + struct blktap_device device; + struct blktap_page_pool *pool; + + wait_queue_head_t remove_wait; + struct work_struct remove_work; + char name[BLKTAP2_MAX_MESSAGE_LEN]; + + struct blktap_statistics stats; +}; + +struct blktap_page_pool { + struct mempool_s *bufs; + spinlock_t lock; + struct kobject kobj; + wait_queue_head_t wait; +}; + +extern struct mutex blktap_lock; +extern struct blktap **blktaps; +extern int blktap_max_minor; + +int blktap_control_destroy_tap(struct blktap *); +size_t blktap_control_debug(struct blktap *, char *, size_t); + +int blktap_ring_init(void); +void blktap_ring_exit(void); +size_t blktap_ring_debug(struct blktap *, char *, size_t); +int blktap_ring_create(struct blktap *); +int blktap_ring_destroy(struct blktap *); +struct blktap_request *blktap_ring_make_request(struct blktap *); +void blktap_ring_free_request(struct blktap *,struct blktap_request *); +void blktap_ring_submit_request(struct blktap *, struct blktap_request *); +int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int); +int blktap_ring_map_request(struct blktap *, struct blktap_request *); +void blktap_ring_unmap_request(struct blktap *, struct blktap_request *); +void blktap_ring_set_message(struct blktap *, int); +void blktap_ring_kick_user(struct blktap *); + +int blktap_sysfs_init(void); +void blktap_sysfs_exit(void); +int blktap_sysfs_create(struct blktap *); +void blktap_sysfs_destroy(struct blktap *); + +int blktap_device_init(void); +void blktap_device_exit(void); +size_t blktap_device_debug(struct blktap *, char *, size_t); +int blktap_device_create(struct blktap *, struct blktap_params *); +int blktap_device_destroy(struct 
blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/control.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,315 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock);
+
+struct blktap **blktaps;
+int blktap_max_minor;
+static struct blktap_page_pool *default_pool;
+
+static struct blktap *
+blktap_control_get_minor(void)
+{
+	int minor;
+	struct blktap *tap;
+
+	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+	if (unlikely(!tap))
+		return NULL;
+
+	mutex_lock(&blktap_lock);
+
+	for (minor = 0; minor < blktap_max_minor; minor++)
+		if (!blktaps[minor])
+			break;
+
+	if (minor == MAX_BLKTAP_DEVICE)
+		goto fail;
+
+	if (minor == blktap_max_minor) {
+		void *p;
+		int n;
+
+		n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
+		p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+		if (!p)
+			goto fail;
+
+		blktaps = p;
+		minor = blktap_max_minor;
+		blktap_max_minor = n;
+
+		memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+	}
+
+	tap->minor = minor;
+	blktaps[minor] = tap;
+
+	__module_get(THIS_MODULE);
+out:
+	mutex_unlock(&blktap_lock);
+	return tap;
+
+fail:
+	kfree(tap);
+	tap = NULL;
+	goto out;
+}
+
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+	blktaps[tap->minor] = NULL;
+	kfree(tap);
+
+	module_put(THIS_MODULE);
+}
+
+static struct blktap*
+blktap_control_create_tap(void)
+{
+	struct blktap *tap;
+	int err;
+
+	tap = blktap_control_get_minor();
+	if (!tap)
+		return NULL;
+
+	kobject_get(&default_pool->kobj);
+	tap->pool = default_pool;
+
+	err = blktap_ring_create(tap);
+	if (err)
+		goto fail_tap;
+
+	err = blktap_sysfs_create(tap);
+	if (err)
+		goto fail_ring;
+
+	return tap;
+
+fail_ring:
+	blktap_ring_destroy(tap);
+fail_tap:
+	blktap_control_put_minor(tap);
+
+	return NULL;
+}
+
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+	int err;
+
+	err = blktap_ring_destroy(tap);
+	if (err)
+		return err;
+
+	kobject_put(&tap->pool->kobj);
+
+	blktap_sysfs_destroy(tap);
+
+	blktap_control_put_minor(tap);
+
+	return 0;
+}
+
+static int
+blktap_control_ioctl(struct inode *inode, struct file *filp,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct blktap *tap;
+
+	switch (cmd) {
+	case BLKTAP2_IOCTL_ALLOC_TAP: {
+		struct blktap_handle h;
+		void __user *ptr = (void __user *)arg;
+
+		tap = blktap_control_create_tap();
+		if (!tap)
+			return -ENOMEM;
+
+		h.ring = blktap_ring_major;
+		h.device = blktap_device_major;
+		h.minor = tap->minor;
+
+		if (copy_to_user(ptr, &h, sizeof(h))) {
+			blktap_control_destroy_tap(tap);
+			return -EFAULT;
+		}
+
+		return 0;
+	}
+
+	case BLKTAP2_IOCTL_FREE_TAP: {
+		int minor = arg;
+
+		if (minor < 0 || minor >= blktap_max_minor)
+			return -EINVAL;
+
+		tap = blktaps[minor];
+		if (!tap)
+			return -ENODEV;
+
+		return blktap_control_destroy_tap(tap);
+	}
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+static struct file_operations blktap_control_file_operations = {
+	.owner = THIS_MODULE,
+	.ioctl = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_control = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "blktap-control",
+	.fops = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	return sprintf(buf, "%s", kobject_name(&default_pool->kobj));
+}
+
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+				  struct device_attribute *attr,
+				  const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool, *tmp = default_pool;
+
+	pool = blktap_page_pool_get(buf);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+
+	default_pool = pool;
+	kobject_put(&tmp->kobj);
+
+	return size;
+}
+
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+		   blktap_control_show_default_pool,
+		   blktap_control_store_default_pool);
+
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+	char *s = buf, *end = buf + size;
+
+	s += snprintf(s, end - s,
+		      "tap %u:%u name:'%s' flags:%#08lx\n",
+		      MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+		      tap->name, tap->dev_inuse);
+
+	return s - buf;
+}
+
+static int __init
+blktap_control_init(void)
+{
+	int err;
+
+	err = misc_register(&blktap_control);
+	if (err)
+		return err;
+
+	control_device = blktap_control.this_device;
+
+	blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
+	blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+	if (!blktaps) {
+		BTERR("failed to allocate blktap minor map");
+		return -ENOMEM;
+	}
+
+	err = blktap_page_pool_init(&control_device->kobj);
+	if (err)
+		return err;
+
+	default_pool = blktap_page_pool_get("default");
+	if (IS_ERR(default_pool))
+		return PTR_ERR(default_pool);
+
+	err = device_create_file(control_device, &dev_attr_default_pool);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void
+blktap_control_exit(void)
+{
+	if (default_pool) {
+		kobject_put(&default_pool->kobj);
+		default_pool = NULL;
+	}
+
+	blktap_page_pool_exit();
+
+	if (blktaps) {
+		kfree(blktaps);
+		blktaps = NULL;
+	}
+
+	if (control_device) {
+		misc_deregister(&blktap_control);
+		control_device = NULL;
+	}
+}
+
+static void
+blktap_exit(void)
+{
+	blktap_control_exit();
+	blktap_ring_exit();
+	blktap_sysfs_exit();
+	blktap_device_exit();
+}
+
+static int __init
+blktap_init(void)
+{
+	int err;
+
+	err = blktap_device_init();
+	if (err)
+		goto fail;
+
+	err = blktap_ring_init();
+	if (err)
+		goto fail;
+
+	err = blktap_sysfs_init();
+	if (err)
+		goto fail;
+
+	err = blktap_control_init();
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	blktap_exit();
+	return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/device.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,564 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <scsi/scsi.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+int blktap_device_major;
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
+
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct blktap_device *tapdev = disk->private_data;
+
+	if (!tapdev)
+		return -ENXIO;
+
+	/* NB. we might have bounced a bd trylock by tapdisk.
when + * failing for reasons not !tapdev, make sure to kick tapdisk + * out of destroy wait state again. */ + + return 0; +} + +static int +blktap_device_release(struct gendisk *disk, fmode_t mode) +{ + struct blktap_device *tapdev = disk->private_data; + struct block_device *bdev = bdget_disk(disk, 0); + struct blktap *tap = dev_to_blktap(tapdev); + + bdput(bdev); + + if (!bdev->bd_openers) { + set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse); + blktap_ring_kick_user(tap); + } + + return 0; +} + +static int +blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + +static int +blktap_device_ioctl(struct block_device *bd, fmode_t mode, + unsigned command, unsigned long argument) +{ + int i; + + switch (command) { + case CDROMMULTISESSION: + BTDBG("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_IDLUN: + if (!access_ok(VERIFY_WRITE, argument, + sizeof(struct scsi_idlun))) + return -EFAULT; + + /* return 0 for now. */ + __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); + __put_user(0, + &((struct scsi_idlun __user *)argument)->host_unique_id); + return 0; + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + +static struct block_device_operations blktap_device_file_operations = { + .owner = THIS_MODULE, + .open = blktap_device_open, + .release = blktap_device_release, + .ioctl = blktap_device_ioctl, + .getgeo = blktap_device_getgeo +}; + +/* NB. __blktap holding the queue lock; blktap where unlocked */ + +static inline struct request* +__blktap_next_queued_rq(struct request_queue *q) +{ + return blk_peek_request(q); +} + +static inline void +__blktap_dequeue_rq(struct request *rq) +{ + blk_start_request(rq); +} + +/* NB. 
err == 0 indicates success, failures < 0 */ + +static inline void +__blktap_end_queued_rq(struct request *rq, int err) +{ + blk_start_request(rq); + __blk_end_request(rq, err, blk_rq_bytes(rq)); +} + +static inline void +__blktap_end_rq(struct request *rq, int err) +{ + __blk_end_request(rq, err, blk_rq_bytes(rq)); +} + +static inline void +blktap_end_rq(struct request *rq, int err) +{ + spin_lock_irq(rq->q->queue_lock); + __blktap_end_rq(rq, err); + spin_unlock_irq(rq->q->queue_lock); +} + +void +blktap_device_end_request(struct blktap *tap, + struct blktap_request *request, + int error) +{ + struct blktap_device *tapdev = &tap->device; + struct request *rq = request->rq; + + blktap_ring_unmap_request(tap, request); + + blktap_ring_free_request(tap, request); + + dev_dbg(disk_to_dev(tapdev->gd), + "end_request: op=%d error=%d bytes=%d\n", + rq_data_dir(rq), error, blk_rq_bytes(rq)); + + blktap_end_rq(rq, error); +} + +int +blktap_device_make_request(struct blktap *tap, struct request *rq) +{ + struct blktap_device *tapdev = &tap->device; + struct blktap_request *request; + int write, nsegs; + int err; + + request = blktap_ring_make_request(tap); + if (IS_ERR(request)) { + err = PTR_ERR(request); + request = NULL; + + if (err == -ENOSPC || err == -ENOMEM) + goto stop; + + goto fail; + } + + write = rq_data_dir(rq) == WRITE; + nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table); + + dev_dbg(disk_to_dev(tapdev->gd), + "make_request: op=%c bytes=%d nsegs=%d\n", + write ? 'w' : 'r', blk_rq_bytes(rq), nsegs); + + request->rq = rq; + request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ; + + err = blktap_request_get_pages(tap, request, nsegs); + if (err) + goto stop; + + err = blktap_ring_map_request(tap, request); + if (err) + goto fail; + + blktap_ring_submit_request(tap, request); + + return 0; + +stop: + tap->stats.st_oo_req++; + err = -EBUSY; + +_out: + if (request) + blktap_ring_free_request(tap, request); + + return err; +fail: + if (printk_ratelimit()) + dev_warn(disk_to_dev(tapdev->gd), + "make request: %d, failing\n", err); + goto _out; +} + +/* + * called from tapdisk context + */ +void +blktap_device_run_queue(struct blktap *tap) +{ + struct blktap_device *tapdev = &tap->device; + struct request_queue *q; + struct request *rq; + int err; + + if (!tapdev->gd) + return; + + q = tapdev->gd->queue; + + spin_lock_irq(&tapdev->lock); + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + + do { + rq = __blktap_next_queued_rq(q); + if (!rq) + break; + + if (!blk_fs_request(rq)) { + __blktap_end_queued_rq(rq, -EOPNOTSUPP); + continue; + } + + spin_unlock_irq(&tapdev->lock); + + err = blktap_device_make_request(tap, rq); + + spin_lock_irq(&tapdev->lock); + + if (err == -EBUSY) { + blk_stop_queue(q); + break; + } + + __blktap_dequeue_rq(rq); + + if (unlikely(err)) + __blktap_end_rq(rq, err); + } while (1); + + spin_unlock_irq(&tapdev->lock); +} + +static void +blktap_device_do_request(struct request_queue *rq) +{ + struct blktap_device *tapdev = rq->queuedata; + struct blktap *tap = dev_to_blktap(tapdev); + + blktap_ring_kick_user(tap); +} + +static void +blktap_device_configure(struct blktap *tap, + struct blktap_params *params) +{ + struct request_queue *rq; + struct blktap_device *dev = &tap->device; + + dev = &tap->device; + rq = dev->gd->queue; + + spin_lock_irq(&dev->lock); + + set_capacity(dev->gd, params->capacity); + + /* Hard sector size and max sectors impersonate the equiv. hardware. 
*/
+	blk_queue_logical_block_size(rq, params->sector_size);
+	blk_queue_max_sectors(rq, 512);
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+	/* Ensure a merged request will fit in a single I/O ring slot. */
+	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	/* Make sure buffer addresses are sector-aligned. */
+	blk_queue_dma_alignment(rq, 511);
+
+	/* We are reordering, but cacheless. */
+	blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN, NULL);
+
+	spin_unlock_irq(&dev->lock);
+}
+
+static int
+blktap_device_validate_params(struct blktap *tap,
+			      struct blktap_params *params)
+{
+	struct device *dev = tap->ring.dev;
+	int sector_order, name_sz;
+
+	sector_order = ffs(params->sector_size) - 1;
+	name_sz = min(sizeof(params->name), sizeof(tap->name));
+
+	if (sector_order < 9 ||
+	    sector_order > 12 ||
+	    params->sector_size != 1U << sector_order)
+		goto fail;
+
+	if (!params->capacity ||
+	    (params->capacity > ULLONG_MAX >> sector_order))
+		goto fail;
+
+	if (strnlen(params->name, name_sz) >= name_sz)
+		goto fail;
+
+	return 0;
+
+fail:
+	params->name[name_sz - 1] = 0;
+	dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
+		params->capacity, params->sector_size, params->name);
+	return -EINVAL;
+}
+
+int
+blktap_device_destroy(struct blktap *tap)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct block_device *bdev;
+	struct gendisk *gd;
+	int err;
+
+	gd = tapdev->gd;
+	if (!gd)
+		return 0;
+
+	bdev = bdget_disk(gd, 0);
+
+	err = !mutex_trylock(&bdev->bd_mutex);
+	if (err) {
+		/* NB. avoid a deadlock. the last opener syncs the
+		 * bdev holding bd_mutex. */
+		err = -EBUSY;
+		goto out_nolock;
+	}
+
+	if (bdev->bd_openers) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	del_gendisk(gd);
+	gd->private_data = NULL;
+
+	blk_cleanup_queue(gd->queue);
+
+	put_disk(gd);
+	tapdev->gd = NULL;
+
+	clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+	err = 0;
+out:
+	mutex_unlock(&bdev->bd_mutex);
+out_nolock:
+	bdput(bdev);
+
+	return err;
+}
+
+static void
+blktap_device_fail_queue(struct blktap *tap)
+{
+	struct blktap_device *tapdev = &tap->device;
+	struct request_queue *q = tapdev->gd->queue;
+
+	spin_lock_irq(&tapdev->lock);
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+	do {
+		struct request *rq = __blktap_next_queued_rq(q);
+		if (!rq)
+			break;
+
+		__blktap_end_queued_rq(rq, -EIO);
+	} while (1);
+
+	spin_unlock_irq(&tapdev->lock);
+}
+
+static int
+blktap_device_try_destroy(struct blktap *tap)
+{
+	int err;
+
+	err = blktap_device_destroy(tap);
+	if (err)
+		blktap_device_fail_queue(tap);
+
+	return err;
+}
+
+void
+blktap_device_destroy_sync(struct blktap *tap)
+{
+	wait_event(tap->ring.poll_wait,
+		   !blktap_device_try_destroy(tap));
+}
+
+int
+blktap_device_create(struct blktap *tap, struct blktap_params *params)
+{
+	int minor, err;
+	struct gendisk *gd;
+	struct request_queue *rq;
+	struct blktap_device *tapdev;
+
+	gd = NULL;
+	rq = NULL;
+	tapdev = &tap->device;
+	minor = tap->minor;
+
+	if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+		return -EEXIST;
+
+	if (blktap_device_validate_params(tap, params))
+		return -EINVAL;
+
+	gd = alloc_disk(1);
+	if (!gd) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (minor < 26) {
+		sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
+	} else if (minor < (26 + 1) * 26) {
+		sprintf(gd->disk_name, "td%c%c",
+			'a' + minor / 26 - 1, 'a' + minor % 26);
+	} else {
+		const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
+		const unsigned int m2 = (minor / 26 - 1) % 26;
+		const unsigned int m3 = minor % 26;
+		sprintf(gd->disk_name, "td%c%c%c",
+			'a' + m1, 'a' + m2, 'a' + m3);
+	}
+
+	gd->major = blktap_device_major;
+	gd->first_minor = minor;
+	gd->fops = &blktap_device_file_operations;
+	gd->private_data = tapdev;
+
+	spin_lock_init(&tapdev->lock);
+	rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
+	if (!rq) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	elevator_init(rq, "noop");
+
+	gd->queue = rq;
+	rq->queuedata = tapdev;
+	tapdev->gd = gd;
+
+	blktap_device_configure(tap, params);
+	add_disk(gd);
+
+	if (params->name[0])
+		strncpy(tap->name, params->name, sizeof(tap->name) - 1);
+
+	set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+
+	dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
+		 queue_logical_block_size(rq),
+		 (unsigned long long)get_capacity(gd));
+
+	return 0;
+
+fail:
+	if (gd)
+		put_disk(gd);
+	if (rq)
+		blk_cleanup_queue(rq);
+
+	return err;
+}
+
+size_t
+blktap_device_debug(struct blktap *tap, char *buf, size_t size)
+{
+	struct gendisk *disk = tap->device.gd;
+	struct request_queue *q;
+	struct block_device *bdev;
+	char *s = buf, *end = buf + size;
+
+	if (!disk)
+		return 0;
+
+	q = disk->queue;
+
+	s += snprintf(s, end - s,
+		      "disk capacity:%llu sector size:%u\n",
+		      (unsigned long long)get_capacity(disk),
+		      queue_logical_block_size(q));
+
+	s += snprintf(s, end - s,
+		      "queue flags:%#lx plugged:%d stopped:%d empty:%d\n",
+		      q->queue_flags,
+		      blk_queue_plugged(q), blk_queue_stopped(q),
+		      elv_queue_empty(q));
+
+	bdev = bdget_disk(disk, 0);
+	if (bdev) {
+		s += snprintf(s, end - s,
+			      "bdev openers:%d closed:%d\n",
+			      bdev->bd_openers,
+			      test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
+		bdput(bdev);
+	}
+
+	return s - buf;
+}
+
+int __init
+blktap_device_init(void)
+{
+	int major;
+
+	/* Dynamically allocate a major for this device */
+	major = register_blkdev(0, "tapdev");
+	if (major < 0) {
+		BTERR("Couldn't register blktap device\n");
+		return -ENOMEM;
+	}
+
+	blktap_device_major = major;
+	BTINFO("blktap device major %d\n", major);
+
+	return 0;
+}
+
+void
+blktap_device_exit(void)
+{
+	if (blktap_device_major)
+		unregister_blkdev(blktap_device_major, "tapdev");
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/request.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,418 @@
+#include <linux/mempool.h>
+#include <linux/slab.h>
+#include <linux/kobject.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
+
+#include "blktap.h"
+
+/* max pages per shared pool. just to prevent accidental dos. */
+#define POOL_MAX_PAGES (256 * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+
+/* default page pool size. when considering to shrink a shared pool,
+ * note that paused tapdisks may grab a whole lot of pages for a long
+ * time. */
+#define POOL_DEFAULT_PAGES (2 * MMAP_PAGES)
+
+/* max number of pages allocatable per request. */
+#define POOL_MAX_REQUEST_PAGES BLKIF_MAX_SEGMENTS_PER_REQUEST
+
+/* min request structs per pool. These grow dynamically. */
+#define POOL_MIN_REQS BLK_RING_SIZE
+
+static struct kset *pool_set;
+
+#define kobj_to_pool(_kobj) \
+	container_of(_kobj, struct blktap_page_pool, kobj)
+
+static struct kmem_cache *request_cache;
+static mempool_t *request_pool;
+
+static void
+__page_pool_wake(struct blktap_page_pool *pool)
+{
+	mempool_t *mem = pool->bufs;
+
+	/*
+	  NB. slightly wasteful to always wait for a full segment
+	  set. but this ensures the next disk makes
+	  progress. presently, the repeated request struct
+	  alloc/release cycles would otherwise keep everyone spinning.
+ */ + + if (mem->curr_nr >= POOL_MAX_REQUEST_PAGES) + wake_up(&pool->wait); +} + +int +blktap_request_get_pages(struct blktap *tap, + struct blktap_request *request, int nr_pages) +{ + struct blktap_page_pool *pool = tap->pool; + mempool_t *mem = pool->bufs; + struct page *page; + + BUG_ON(request->nr_pages != 0); + BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES); + + if (mem->curr_nr < nr_pages) + return -ENOMEM; + + /* NB. avoid thundering herds of tapdisks colliding. */ + spin_lock(&pool->lock); + + if (mem->curr_nr < nr_pages) { + spin_unlock(&pool->lock); + return -ENOMEM; + } + + while (request->nr_pages < nr_pages) { + page = mempool_alloc(mem, GFP_NOWAIT); + BUG_ON(!page); + request->pages[request->nr_pages++] = page; + } + + spin_unlock(&pool->lock); + + return 0; +} + +static void +blktap_request_put_pages(struct blktap *tap, + struct blktap_request *request) +{ + struct blktap_page_pool *pool = tap->pool; + struct page *page; + + while (request->nr_pages) { + page = request->pages[--request->nr_pages]; + mempool_free(page, pool->bufs); + } +} + +size_t +blktap_request_debug(struct blktap *tap, char *buf, size_t size) +{ + struct blktap_page_pool *pool = tap->pool; + mempool_t *mem = pool->bufs; + char *s = buf, *end = buf + size; + + s += snprintf(buf, end - s, + "pool:%s pages:%d free:%d\n", + kobject_name(&pool->kobj), + mem->min_nr, mem->curr_nr); + + return s - buf; +} + +struct blktap_request* +blktap_request_alloc(struct blktap *tap) +{ + struct blktap_request *request; + + request = mempool_alloc(request_pool, GFP_NOWAIT); + if (request) + request->tap = tap; + + return request; +} + +void +blktap_request_free(struct blktap *tap, + struct blktap_request *request) +{ + blktap_request_put_pages(tap, request); + + mempool_free(request, request_pool); + + __page_pool_wake(tap->pool); +} + +void +blktap_request_bounce(struct blktap *tap, + struct blktap_request *request, + int seg, int write) +{ + struct scatterlist *sg = &request->sg_table[seg]; + void *s, *p; + + BUG_ON(seg >= request->nr_pages); + + s = sg_virt(sg); + p = page_address(request->pages[seg]) + sg->offset; + + if (write) + memcpy(p, s, sg->length); + else + memcpy(s, p, sg->length); +} + +static void +blktap_request_ctor(void *obj) +{ + struct blktap_request *request = obj; + + memset(request, 0, sizeof(*request)); + sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table)); +} + +static int +blktap_page_pool_resize(struct blktap_page_pool *pool, int target) +{ + mempool_t *bufs = pool->bufs; + int err; + + /* NB. mempool asserts min_nr >= 1 */ + target = max(1, target); + + err = mempool_resize(bufs, target, GFP_KERNEL); + if (err) + return err; + + __page_pool_wake(pool); + + return 0; +} + +struct pool_attribute { + struct attribute attr; + + ssize_t (*show)(struct blktap_page_pool *pool, + char *buf); + + ssize_t (*store)(struct blktap_page_pool *pool, + const char *buf, size_t count); +}; + +#define kattr_to_pool_attr(_kattr) \ + container_of(_kattr, struct pool_attribute, attr) + +static ssize_t +blktap_page_pool_show_size(struct blktap_page_pool *pool, + char *buf) +{ + mempool_t *mem = pool->bufs; + return sprintf(buf, "%d", mem->min_nr); +} + +static ssize_t +blktap_page_pool_store_size(struct blktap_page_pool *pool, + const char *buf, size_t size) +{ + int target; + + /* + * NB. target fixup to avoid undesired results. less than a + * full segment set can wedge the disk. much more than a + * couple times the physical queue depth is rarely useful. 
+	 */
+
+	target = simple_strtoul(buf, NULL, 0);
+	target = max(POOL_MAX_REQUEST_PAGES, target);
+	target = min(target, POOL_MAX_PAGES);
+
+	return blktap_page_pool_resize(pool, target) ? : size;
+}
+
+static struct pool_attribute blktap_page_pool_attr_size =
+	__ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+	       blktap_page_pool_show_size,
+	       blktap_page_pool_store_size);
+
+static ssize_t
+blktap_page_pool_show_free(struct blktap_page_pool *pool,
+			   char *buf)
+{
+	mempool_t *mem = pool->bufs;
+	return sprintf(buf, "%d", mem->curr_nr);
+}
+
+static struct pool_attribute blktap_page_pool_attr_free =
+	__ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
+	       blktap_page_pool_show_free,
+	       NULL);
+
+static struct attribute *blktap_page_pool_attrs[] = {
+	&blktap_page_pool_attr_size.attr,
+	&blktap_page_pool_attr_free.attr,
+	NULL,
+};
+
+static inline struct kobject*
+__blktap_kset_find_obj(struct kset *kset, const char *name)
+{
+	struct kobject *k;
+	struct kobject *ret = NULL;
+
+	spin_lock(&kset->list_lock);
+	list_for_each_entry(k, &kset->list, entry) {
+		if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
+			ret = kobject_get(k);
+			break;
+		}
+	}
+	spin_unlock(&kset->list_lock);
+	return ret;
+}
+
+static ssize_t
+blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
+			   char *buf)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->show)
+		return attr->show(pool, buf);
+
+	return -EIO;
+}
+
+static ssize_t
+blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
+			    const char *buf, size_t size)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+	if (attr->store)
+		return attr->store(pool, buf, size);
+
+	return -EIO;
+}
+
+static struct sysfs_ops blktap_page_pool_sysfs_ops = {
+	.show = blktap_page_pool_show_attr,
+	.store = blktap_page_pool_store_attr,
+};
+
+static void
+blktap_page_pool_release(struct kobject *kobj)
+{
+	struct blktap_page_pool *pool = kobj_to_pool(kobj);
+	mempool_destroy(pool->bufs);
+	kfree(pool);
+}
+
+struct kobj_type blktap_page_pool_ktype = {
+	.release = blktap_page_pool_release,
+	.sysfs_ops = &blktap_page_pool_sysfs_ops,
+	.default_attrs = blktap_page_pool_attrs,
+};
+
+static void*
+__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	struct page *page;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return NULL;
+
+	page = alloc_page(gfp_mask);
+	if (page)
+		SetPageReserved(page);
+
+	return page;
+}
+
+static void
+__mempool_page_free(void *element, void *pool_data)
+{
+	struct page *page = element;
+
+	ClearPageReserved(page);
+	put_page(page);
+}
+
+static struct kobject*
+blktap_page_pool_create(const char *name, int nr_pages)
+{
+	struct blktap_page_pool *pool;
+	int err;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		goto fail;
+
+	spin_lock_init(&pool->lock);
+	init_waitqueue_head(&pool->wait);
+
+	pool->bufs = mempool_create(nr_pages,
+				    __mempool_page_alloc, __mempool_page_free,
+				    pool);
+	if (!pool->bufs)
+		goto fail_pool;
+
+	kobject_init(&pool->kobj, &blktap_page_pool_ktype);
+	pool->kobj.kset = pool_set;
+	err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
+	if (err)
+		goto fail_bufs;
+
+	return &pool->kobj;
+
+fail_bufs:
+	mempool_destroy(pool->bufs);
+fail_pool:
+	kfree(pool);
+fail:
+	return NULL;
+}
+
+struct blktap_page_pool*
+blktap_page_pool_get(const char *name)
+{
+	struct kobject *kobj;
+
+	kobj = __blktap_kset_find_obj(pool_set, name);
+	if (!kobj)
+		kobj = blktap_page_pool_create(name,
+					       POOL_DEFAULT_PAGES);
+	if (!kobj)
+		return ERR_PTR(-ENOMEM);
+
+	return kobj_to_pool(kobj);
+}
+
+int __init
+blktap_page_pool_init(struct kobject *parent)
+{
+	request_cache =
+		kmem_cache_create("blktap-request",
+				  sizeof(struct blktap_request), 0,
+				  0, blktap_request_ctor);
+	if (!request_cache)
+		return -ENOMEM;
+
+	request_pool =
+		mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
+	if (!request_pool)
+		return -ENOMEM;
+
+	pool_set = kset_create_and_add("pools", NULL, parent);
+	if (!pool_set)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void
+blktap_page_pool_exit(void)
+{
+	if (pool_set) {
+		BUG_ON(!list_empty(&pool_set->list));
+		kset_unregister(pool_set);
+		pool_set = NULL;
+	}
+
+	if (request_pool) {
+		mempool_destroy(request_pool);
+		request_pool = NULL;
+	}
+
+	if (request_cache) {
+		kmem_cache_destroy(request_cache);
+		request_cache = NULL;
+	}
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/ring.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,550 @@
+
+#include <linux/device.h>
+#include <linux/blkdev.h>
+#include <linux/cdev.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+int blktap_ring_major;
+static struct cdev blktap_ring_cdev;
+
+ /*
+  * BLKTAP - immediately before the mmap area,
+  * we have a bunch of pages reserved for shared memory rings.
+  */
+#define RING_PAGES 1
+
+static void
+blktap_ring_read_response(struct blktap *tap,
+			  const struct blkif_response *rsp)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request;
+	int usr_idx, err;
+
+	request = NULL;
+
+	usr_idx = rsp->id;
+	if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
+		err = -ERANGE;
+		goto invalid;
+	}
+
+	request = ring->pending[usr_idx];
+
+	if (!request) {
+		err = -ESRCH;
+		goto invalid;
+	}
+
+	if (rsp->operation != request->operation) {
+		err = -EINVAL;
+		goto invalid;
+	}
+
+	dev_dbg(ring->dev,
+		"request %d [%p] response: %d\n",
+		request->usr_idx, request, rsp->status);
+
+	err = rsp->status == BLKIF_RSP_OKAY ?
0 : -EIO; +end_request: + blktap_device_end_request(tap, request, err); + return; + +invalid: + dev_warn(ring->dev, + "invalid response, idx:%d status:%d op:%d/%d: err %d\n", + usr_idx, rsp->status, + rsp->operation, request->operation, + err); + if (request) + goto end_request; +} + +static void +blktap_read_ring(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + struct blkif_response rsp; + RING_IDX rc, rp; + + down_read(¤t->mm->mmap_sem); + if (!ring->vma) { + up_read(¤t->mm->mmap_sem); + return; + } + + /* for each outstanding message on the ring */ + rp = ring->ring.sring->rsp_prod; + rmb(); + + for (rc = ring->ring.rsp_cons; rc != rp; rc++) { + memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp)); + blktap_ring_read_response(tap, &rsp); + } + + ring->ring.rsp_cons = rc; + + up_read(¤t->mm->mmap_sem); +} + +static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static void +blktap_ring_fail_pending(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + struct blktap_request *request; + int usr_idx; + + for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { + request = ring->pending[usr_idx]; + if (!request) + continue; + + blktap_device_end_request(tap, request, -EIO); + } +} + +static void +blktap_ring_vm_close(struct vm_area_struct *vma) +{ + struct blktap *tap = vma->vm_private_data; + struct blktap_ring *ring = &tap->ring; + struct page *page = virt_to_page(ring->ring.sring); + + blktap_ring_fail_pending(tap); + + zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL); + ClearPageReserved(page); + __free_page(page); + + ring->vma = NULL; + + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + blktap_control_destroy_tap(tap); +} + +static struct vm_operations_struct blktap_ring_vm_operations = { + .close = blktap_ring_vm_close, + .fault = blktap_ring_fault, +}; + +int +blktap_ring_map_segment(struct blktap *tap, + struct blktap_request *request, + int seg) +{ + struct blktap_ring *ring = &tap->ring; + unsigned long uaddr; + + uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); + return vm_insert_page(ring->vma, uaddr, request->pages[seg]); +} + +int +blktap_ring_map_request(struct blktap *tap, + struct blktap_request *request) +{ + int seg, err = 0; + int write; + + write = request->operation == BLKIF_OP_WRITE; + + for (seg = 0; seg < request->nr_pages; seg++) { + if (write) + blktap_request_bounce(tap, request, seg, write); + + err = blktap_ring_map_segment(tap, request, seg); + if (err) + break; + } + + if (err) + blktap_ring_unmap_request(tap, request); + + return err; +} + +void +blktap_ring_unmap_request(struct blktap *tap, + struct blktap_request *request) +{ + struct blktap_ring *ring = &tap->ring; + unsigned long uaddr; + unsigned size; + int seg, read; + + uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0); + size = request->nr_pages << PAGE_SHIFT; + read = request->operation == BLKIF_OP_READ; + + if (read) + for (seg = 0; seg < request->nr_pages; seg++) + blktap_request_bounce(tap, request, seg, !read); + + zap_page_range(ring->vma, uaddr, size, NULL); +} + +void +blktap_ring_free_request(struct blktap *tap, + struct blktap_request *request) +{ + struct blktap_ring *ring = &tap->ring; + + ring->pending[request->usr_idx] = NULL; + ring->n_pending--; + + blktap_request_free(tap, request); +} + +struct blktap_request* +blktap_ring_make_request(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + struct blktap_request *request; + int usr_idx; + 
+ if (RING_FULL(&ring->ring)) + return ERR_PTR(-ENOSPC); + + request = blktap_request_alloc(tap); + if (!request) + return ERR_PTR(-ENOMEM); + + for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++) + if (!ring->pending[usr_idx]) + break; + + BUG_ON(usr_idx >= BLK_RING_SIZE); + + request->tap = tap; + request->usr_idx = usr_idx; + + ring->pending[usr_idx] = request; + ring->n_pending++; + + return request; +} + +void +blktap_ring_submit_request(struct blktap *tap, + struct blktap_request *request) +{ + struct blktap_ring *ring = &tap->ring; + struct blkif_request *breq; + struct scatterlist *sg; + int i, nsecs = 0; + + dev_dbg(ring->dev, + "request %d [%p] submit\n", request->usr_idx, request); + + breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt); + + breq->id = request->usr_idx; + breq->sector_number = blk_rq_pos(request->rq); + breq->handle = 0; + breq->operation = request->operation; + breq->nr_segments = request->nr_pages; + + blktap_for_each_sg(sg, request, i) { + struct blkif_request_segment *seg = &breq->seg[i]; + int first, count; + + count = sg->length >> 9; + first = sg->offset >> 9; + + seg->first_sect = first; + seg->last_sect = first + count - 1; + + nsecs += count; + } + + ring->ring.req_prod_pvt++; + + do_gettimeofday(&request->time); + + + if (request->operation == BLKIF_OP_WRITE) { + tap->stats.st_wr_sect += nsecs; + tap->stats.st_wr_req++; + } + + if (request->operation == BLKIF_OP_READ) { + tap->stats.st_rd_sect += nsecs; + tap->stats.st_rd_req++; + } +} + +static int +blktap_ring_open(struct inode *inode, struct file *filp) +{ + struct blktap *tap = NULL; + int minor; + + minor = iminor(inode); + + if (minor < blktap_max_minor) + tap = blktaps[minor]; + + if (!tap) + return -ENXIO; + + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -ENXIO; + + if (tap->ring.task) + return -EBUSY; + + filp->private_data = tap; + tap->ring.task = current; + + return 0; +} + +static int +blktap_ring_release(struct inode *inode, struct file *filp) +{ + struct blktap *tap = filp->private_data; + + blktap_device_destroy_sync(tap); + + tap->ring.task = NULL; + + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + blktap_control_destroy_tap(tap); + + return 0; +} + +static int +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = &tap->ring; + struct blkif_sring *sring; + struct page *page = NULL; + int err; + + if (ring->vma) + return -EBUSY; + + page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!page) + return -ENOMEM; + + SetPageReserved(page); + + err = vm_insert_page(vma, vma->vm_start, page); + if (err) + goto fail; + + sring = page_address(page); + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE); + + ring->ring_vstart = vma->vm_start; + ring->user_vstart = ring->ring_vstart + PAGE_SIZE; + + vma->vm_private_data = tap; + + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_RESERVED; + + vma->vm_ops = &blktap_ring_vm_operations; + + ring->vma = vma; + return 0; + +fail: + if (page) { + zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL); + ClearPageReserved(page); + __free_page(page); + } + + return err; +} + +static int +blktap_ring_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = &tap->ring; + + BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg); + + if (!ring->vma || ring->vma->vm_mm != current->mm) + return -EACCES; + + 
switch(cmd) { + case BLKTAP2_IOCTL_KICK_FE: + + blktap_read_ring(tap); + return 0; + + case BLKTAP2_IOCTL_CREATE_DEVICE: { + struct blktap_params params; + void __user *ptr = (void *)arg; + + if (!arg) + return -EINVAL; + + if (copy_from_user(¶ms, ptr, sizeof(params))) + return -EFAULT; + + return blktap_device_create(tap, ¶ms); + } + + case BLKTAP2_IOCTL_REMOVE_DEVICE: + + return blktap_device_destroy(tap); + } + + return -ENOIOCTLCMD; +} + +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = &tap->ring; + int work; + + poll_wait(filp, &tap->pool->wait, wait); + poll_wait(filp, &ring->poll_wait, wait); + + down_read(¤t->mm->mmap_sem); + if (ring->vma && tap->device.gd) + blktap_device_run_queue(tap); + up_read(¤t->mm->mmap_sem); + + work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod; + RING_PUSH_REQUESTS(&ring->ring); + + if (work || + ring->ring.sring->private.tapif_user.msg || + test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse)) + return POLLIN | POLLRDNORM; + + return 0; +} + +static struct file_operations blktap_ring_file_operations = { + .owner = THIS_MODULE, + .open = blktap_ring_open, + .release = blktap_ring_release, + .ioctl = blktap_ring_ioctl, + .mmap = blktap_ring_mmap, + .poll = blktap_ring_poll, +}; + +void +blktap_ring_kick_user(struct blktap *tap) +{ + wake_up(&tap->ring.poll_wait); +} + +int +blktap_ring_destroy(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + + if (ring->task || ring->vma) + return -EBUSY; + + return 0; +} + +int +blktap_ring_create(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + + init_waitqueue_head(&ring->poll_wait); + ring->devno = MKDEV(blktap_ring_major, tap->minor); + + return 0; +} + +size_t +blktap_ring_debug(struct blktap *tap, char *buf, size_t size) +{ + struct blktap_ring *ring = &tap->ring; + char *s = buf, *end = buf + size; + int usr_idx; + + s += snprintf(s, end - s, + "begin pending:%d\n", ring->n_pending); + + for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { + struct blktap_request *request; + struct timeval *time; + int write; + + request = ring->pending[usr_idx]; + if (!request) + continue; + + write = request->operation == BLKIF_OP_WRITE; + time = &request->time; + + s += snprintf(s, end - s, + "%02d: usr_idx:%02d " + "op:%c nr_pages:%02d time:%lu.%09lu\n", + usr_idx, request->usr_idx, + write ? 
'W' : 'R', request->nr_pages,
+			      time->tv_sec, time->tv_usec);
+	}
+
+	s += snprintf(s, end - s, "end pending\n");
+
+	return s - buf;
+}
+
+
+int __init
+blktap_ring_init(void)
+{
+	dev_t dev = 0;
+	int err;
+
+	cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
+	blktap_ring_cdev.owner = THIS_MODULE;
+
+	err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2");
+	if (err < 0) {
+		BTERR("error registering ring devices: %d\n", err);
+		return err;
+	}
+
+	err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
+	if (err) {
+		BTERR("error adding ring device: %d\n", err);
+		unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE);
+		return err;
+	}
+
+	blktap_ring_major = MAJOR(dev);
+	BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+	return 0;
+}
+
+void
+blktap_ring_exit(void)
+{
+	if (!blktap_ring_major)
+		return;
+
+	cdev_del(&blktap_ring_cdev);
+	unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
+				 MAX_BLKTAP_DEVICE);
+
+	blktap_ring_major = 0;
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/xen/blktap2-new/sysfs.c	2011-02-24 13:49:49.000000000 +0100
@@ -0,0 +1,288 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr,
+		      const char *buf, size_t size)
+{
+	struct blktap *tap;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	if (size >= BLKTAP2_MAX_MESSAGE_LEN)
+		return -ENAMETOOLONG;
+
+	if (strnlen(buf, size) != size)
+		return -EINVAL;
+
+	strcpy(tap->name, buf);
+
+	return size;
+}
+
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr,
+		      char *buf)
+{
+	struct blktap *tap;
+	ssize_t size;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	if (tap->name[0])
+		size = sprintf(buf, "%s\n", tap->name);
+	else
+		size = sprintf(buf, "%d\n", tap->minor);
+
+	return size;
+}
+static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
+		   blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+static void
+blktap_sysfs_remove_work(struct work_struct *work)
+{
+	struct blktap *tap
+		= container_of(work, struct blktap, remove_work);
+	blktap_control_destroy_tap(tap);
+}
+
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev,
+			   struct device_attribute *attr,
+			   const char *buf, size_t size)
+{
+	struct blktap *tap;
+	int err;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return size;
+
+	if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+		goto wait;
+
+	if (tap->ring.vma) {
+		struct blkif_sring *sring = tap->ring.ring.sring;
+		sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
+		blktap_ring_kick_user(tap);
+	} else {
+		INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
+		schedule_work(&tap->remove_work);
+	}
+wait:
+	err = wait_event_interruptible(tap->remove_wait,
+				       !dev_get_drvdata(dev));
+	if (err)
+		return err;
+
+	return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct blktap *tap;
+	char *s = buf, *end = buf + PAGE_SIZE;
+
+	tap = dev_get_drvdata(dev);
+	if (!tap)
+		return 0;
+
+	s += blktap_control_debug(tap, s, end - s);
+
+	s += blktap_request_debug(tap, s, end - s);
+
+	s += blktap_device_debug(tap, s, end - s);
+
+	s += blktap_ring_debug(tap, s, end - s);
+
+	return s - buf;
+}
+static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
+
+static ssize_t +blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct blktap *tap; + ssize_t rv = 0; + + tap = dev_get_drvdata(dev); + if (!tap) + return 0; + + if (tap->ring.task) + rv = sprintf(buf, "%d\n", tap->ring.task->pid); + + return rv; +} +static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL); + +static ssize_t +blktap_sysfs_show_pool(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct blktap *tap = dev_get_drvdata(dev); + return sprintf(buf, "%s", kobject_name(&tap->pool->kobj)); +} + +static ssize_t +blktap_sysfs_store_pool(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct blktap *tap = dev_get_drvdata(dev); + struct blktap_page_pool *pool, *tmp = tap->pool; + + if (tap->device.gd) + return -EBUSY; + + pool = blktap_page_pool_get(buf); + if (IS_ERR(pool)) + return PTR_ERR(pool); + + tap->pool = pool; + kobject_put(&tmp->kobj); + + return size; +} +DEVICE_ATTR(pool, S_IRUSR|S_IWUSR, + blktap_sysfs_show_pool, blktap_sysfs_store_pool); + +int +blktap_sysfs_create(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + struct device *dev; + int err = 0; + + init_waitqueue_head(&tap->remove_wait); + + dev = device_create(class, NULL, ring->devno, + tap, "blktap%d", tap->minor); + if (IS_ERR(dev)) + err = PTR_ERR(dev); + if (!err) + err = device_create_file(dev, &dev_attr_name); + if (!err) + err = device_create_file(dev, &dev_attr_remove); + if (!err) + err = device_create_file(dev, &dev_attr_debug); + if (!err) + err = device_create_file(dev, &dev_attr_task); + if (!err) + err = device_create_file(dev, &dev_attr_pool); + if (!err) + ring->dev = dev; + else + device_unregister(dev); + + return err; +} + +void +blktap_sysfs_destroy(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + struct device *dev; + + dev = ring->dev; + + if (!dev) + return; + + dev_set_drvdata(dev, NULL); + wake_up(&tap->remove_wait); + + device_unregister(dev); + ring->dev = NULL; +} + +static ssize_t +blktap_sysfs_show_verbosity(struct class *class, char *buf) +{ + return sprintf(buf, "%d\n", blktap_debug_level); +} + +static ssize_t +blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) +{ + int level; + + if (sscanf(buf, "%d", &level) == 1) { + blktap_debug_level = level; + return size; + } + + return -EINVAL; +} +static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR, + blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); + +static ssize_t +blktap_sysfs_show_devices(struct class *class, char *buf) +{ + int i, ret; + struct blktap *tap; + + mutex_lock(&blktap_lock); + + ret = 0; + for (i = 0; i < blktap_max_minor; i++) { + tap = blktaps[i]; + if (!tap) + continue; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + continue; + + ret += sprintf(buf + ret, "%d %s\n", tap->minor, tap->name); + } + + mutex_unlock(&blktap_lock); + + return ret; +} +static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL); + +void +blktap_sysfs_exit(void) +{ + if (class) + class_destroy(class); +} + +int __init +blktap_sysfs_init(void) +{ + struct class *cls; + int err = 0; + + cls = class_create(THIS_MODULE, "blktap2"); + if (IS_ERR(cls)) + err = PTR_ERR(cls); + if (!err) + err = class_create_file(cls, &class_attr_verbosity); + if (!err) + err = class_create_file(cls, &class_attr_devices); + if (!err) + class = cls; + else + class_destroy(cls); + + return err; +}
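
---

For reference, the userspace control flow implied by the ABI above: a tapdisk-like process allocates a tap minor through the misc control node, mmaps the per-tap ring device (one shared-ring page followed by the per-request data area), and only then asks the kernel to expose a block device. The sketch below is hypothetical and not part of the patch: the device node paths depend on udev rules and are assumptions, and the mmap size assumes 4 KiB pages with the standard 32-slot blkif ring (RING_PAGES + MMAP_PAGES = 1 + 32 * 11 pages); the ioctl numbers and structures mirror blktap.h.

	/* Hypothetical userspace sketch (not part of the patch).  Node
	 * paths and page-count arithmetic are assumptions; see above. */
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>

	#define BLKTAP2_IOCTL_ALLOC_TAP     200
	#define BLKTAP2_IOCTL_CREATE_DEVICE 202
	#define BLKTAP2_MAX_MESSAGE_LEN     256

	struct blktap_handle {
		unsigned int ring;
		unsigned int device;
		unsigned int minor;
	};

	struct blktap_params {
		char name[BLKTAP2_MAX_MESSAGE_LEN];
		unsigned long long capacity;
		unsigned long sector_size;
	};

	int main(void)
	{
		struct blktap_handle h;
		struct blktap_params params;
		char path[64];
		void *area;
		int ctl, ring;

		/* 1. Allocate a tap minor via the misc control device. */
		ctl = open("/dev/blktap-control", O_RDWR);	/* assumed node */
		if (ctl < 0 || ioctl(ctl, BLKTAP2_IOCTL_ALLOC_TAP, &h) < 0)
			return 1;

		/* 2. Open this tap's ring device (char major h.ring). */
		snprintf(path, sizeof(path), "/dev/blktap%u", h.minor); /* assumed */
		ring = open(path, O_RDWR);
		if (ring < 0)
			return 1;

		/* 3. Map the shared ring page plus the request data area:
		 *    RING_PAGES + MMAP_PAGES = 1 + 32 * 11 pages of 4 KiB. */
		area = mmap(NULL, (1 + 32 * 11) * 4096, PROT_READ | PROT_WRITE,
			    MAP_SHARED, ring, 0);
		if (area == MAP_FAILED)
			return 1;

		/* 4. Expose the block device; the ring must be mapped first,
		 *    since the ioctl handler checks ring->vma. */
		memset(&params, 0, sizeof(params));
		strcpy(params.name, "example");
		params.capacity = 2048;		/* in sectors */
		params.sector_size = 512;
		if (ioctl(ring, BLKTAP2_IOCTL_CREATE_DEVICE, &params) < 0)
			return 1;

		/* A real tapdisk would now poll(2) the ring fd and service
		 * blkif requests out of the mapped area. */
		pause();
		return 0;
	}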