Subject: xen3 xen-drivers From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 1073:8fe973d8fb98) Patch-mainline: n/a Acked-by: jbeulich@novell.com List of files not needed anymore, e.g. because there being a suitable upstream variant (and hence removed from this patch), for reference and in case upstream wants to take the forward porting patches: 2.6.26/drivers/xen/core/features.c 2.6.26/drivers/xen/core/xencomm.c 2.6.31/drivers/xen/evtchn/Makefile 2.6.31/drivers/xen/evtchn/evtchn.c --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/balloon/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,2 @@ + +obj-y := balloon.o sysfs.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/balloon/balloon.c 2010-03-31 09:56:02.000000000 +0200 @@ -0,0 +1,757 @@ +/****************************************************************************** + * balloon.c + * + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *balloon_pde; +#endif + +static DEFINE_MUTEX(balloon_mutex); + +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. 
+ */ +DEFINE_SPINLOCK(balloon_lock); + +struct balloon_stats balloon_stats; + +/* We increase/decrease in batches which fit in a page */ +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; + +/* VM /proc information for memory */ +extern unsigned long totalram_pages; + +#ifndef MODULE +extern unsigned long totalhigh_pages; +#define inc_totalhigh_pages() (totalhigh_pages++) +#define dec_totalhigh_pages() (totalhigh_pages--) +#else +#define inc_totalhigh_pages() ((void)0) +#define dec_totalhigh_pages() ((void)0) +#endif + +#ifndef CONFIG_XEN +/* + * In HVM guests accounting here uses the Xen visible values, but the kernel + * determined totalram_pages value shouldn't get altered. Since totalram_pages + * includes neither the kernel static image nor any memory allocated prior to + * or from the bootmem allocator, we have to synchronize the two values. + */ +static unsigned long __read_mostly totalram_bias; +#else +#define totalram_bias 0 +#endif + +/* List of ballooned pages, threaded through the mem_map array. */ +static LIST_HEAD(ballooned_pages); + +/* Main work function, always executed in process context. */ +static void balloon_process(void *unused); +static DECLARE_WORK(balloon_worker, balloon_process, NULL); +static struct timer_list balloon_timer; + +/* When ballooning out (allocating memory to return to Xen) we don't really + want the kernel to try too hard since that can trigger the oom killer. */ +#define GFP_BALLOON \ + (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|__GFP_COLD) + +#define PAGE_TO_LIST(p) (&(p)->lru) +#define LIST_TO_PAGE(l) list_entry((l), struct page, lru) +#define UNLIST_PAGE(p) \ + do { \ + list_del(PAGE_TO_LIST(p)); \ + PAGE_TO_LIST(p)->next = NULL; \ + PAGE_TO_LIST(p)->prev = NULL; \ + } while(0) + +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_mem: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_mem: " fmt, ##args) + +/* balloon_append: add the given page to the balloon. */ +static void balloon_append(struct page *page) +{ + /* Lowmem is re-populated first, so highmem pages go at list tail. */ + if (PageHighMem(page)) { + list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); + bs.balloon_high++; + dec_totalhigh_pages(); + } else { + list_add(PAGE_TO_LIST(page), &ballooned_pages); + bs.balloon_low++; + } +} + +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ +static struct page *balloon_retrieve(void) +{ + struct page *page; + + if (list_empty(&ballooned_pages)) + return NULL; + + page = LIST_TO_PAGE(ballooned_pages.next); + UNLIST_PAGE(page); + + if (PageHighMem(page)) { + bs.balloon_high--; + inc_totalhigh_pages(); + } + else + bs.balloon_low--; + + return page; +} + +static struct page *balloon_first_page(void) +{ + if (list_empty(&ballooned_pages)) + return NULL; + return LIST_TO_PAGE(ballooned_pages.next); +} + +static struct page *balloon_next_page(struct page *page) +{ + struct list_head *next = PAGE_TO_LIST(page)->next; + if (next == &ballooned_pages) + return NULL; + return LIST_TO_PAGE(next); +} + +static inline void balloon_free_page(struct page *page) +{ +#ifndef MODULE + if (put_page_testzero(page)) + free_cold_page(page); +#else + /* free_cold_page() is not being exported. 
*/ + __free_page(page); +#endif +} + +static void balloon_alarm(unsigned long unused) +{ + schedule_work(&balloon_worker); +} + +static unsigned long current_target(void) +{ + unsigned long target = bs.target_pages; + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) + target = bs.current_pages + bs.balloon_low + bs.balloon_high; + return target; +} + +static unsigned long minimum_target(void) +{ +#ifndef CONFIG_XEN +#define max_pfn num_physpages +#endif + unsigned long min_pages, curr_pages = current_target(); + +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* Simple continuous piecewiese linear function: + * max MiB -> min MiB gradient + * 0 0 + * 16 16 + * 32 24 + * 128 72 (1/2) + * 512 168 (1/4) + * 2048 360 (1/8) + * 8192 552 (1/32) + * 32768 1320 + * 131072 4392 + */ + if (max_pfn < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (max_pfn >> 1); + else if (max_pfn < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (max_pfn >> 2); + else if (max_pfn < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (max_pfn >> 3); + else + min_pages = MB2PAGES(296) + (max_pfn >> 5); +#undef MB2PAGES + + /* Don't enforce growth */ + return min(min_pages, curr_pages); +#ifndef CONFIG_XEN +#undef max_pfn +#endif +} + +static int increase_reservation(unsigned long nr_pages) +{ + unsigned long pfn, i, flags; + struct page *page; + long rc; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + balloon_lock(flags); + + page = balloon_first_page(); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page == NULL); + frame_list[i] = page_to_pfn(page);; + page = balloon_next_page(page); + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc < 0) + goto out; + + for (i = 0; i < rc; i++) { + page = balloon_retrieve(); + BUG_ON(page == NULL); + + pfn = page_to_pfn(page); + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && + phys_to_machine_mapping_valid(pfn)); + + set_phys_to_machine(pfn, frame_list[i]); + +#ifdef CONFIG_XEN + /* Link back into the page tables if not highmem. */ + if (pfn < max_low_pfn) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte_ma(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } +#endif + + /* Relinquish the page back to the allocator. */ + ClearPageReserved(page); + init_page_count(page); + balloon_free_page(page); + } + + bs.current_pages += rc; + totalram_pages = bs.current_pages - totalram_bias; + + out: + balloon_unlock(flags); + + return rc < 0 ? 
rc : rc != nr_pages; +} + +static int decrease_reservation(unsigned long nr_pages) +{ + unsigned long pfn, i, flags; + struct page *page; + void *v; + int need_sleep = 0; + int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { + if ((page = alloc_page(GFP_BALLOON)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; + } + + pfn = page_to_pfn(page); + frame_list[i] = pfn_to_mfn(pfn); + + if (!PageHighMem(page)) { + v = phys_to_virt(pfn << PAGE_SHIFT); + scrub_pages(v, 1); +#ifdef CONFIG_XEN + ret = HYPERVISOR_update_va_mapping( + (unsigned long)v, __pte_ma(0), 0); + BUG_ON(ret); +#endif + } +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } +#endif + } + +#ifdef CONFIG_XEN + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); +#endif + + balloon_lock(flags); + + /* No more mappings: invalidate P2M and add to balloon. */ + for (i = 0; i < nr_pages; i++) { + pfn = mfn_to_pfn(frame_list[i]); + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + balloon_append(pfn_to_page(pfn)); + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != nr_pages); + + bs.current_pages -= nr_pages; + totalram_pages = bs.current_pages - totalram_bias; + + balloon_unlock(flags); + + return need_sleep; +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void balloon_process(void *unused) +{ + int need_sleep = 0; + long credit; + + mutex_lock(&balloon_mutex); + + do { + credit = current_target() - bs.current_pages; + if (credit > 0) + need_sleep = (increase_reservation(credit) != 0); + if (credit < 0) + need_sleep = (decrease_reservation(-credit) != 0); + +#ifndef CONFIG_PREEMPT + if (need_resched()) + schedule(); +#endif + } while ((credit != 0) && !need_sleep); + + /* Schedule more work if there is some still to be done. */ + if (current_target() != bs.current_pages) + mod_timer(&balloon_timer, jiffies + HZ); + + mutex_unlock(&balloon_mutex); +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +void balloon_set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. */ + bs.target_pages = max(target, minimum_target()); + schedule_work(&balloon_worker); +} + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
+ */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +#ifdef CONFIG_PROC_FS +static int balloon_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + + if (copy_from_user(memstring, buffer, count)) + return -EFAULT; + memstring[sizeof(memstring)-1] = '\0'; + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static int balloon_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf( + page, + "Current allocation: %8lu kB\n" + "Requested target: %8lu kB\n" + "Low-mem balloon: %8lu kB\n" + "High-mem balloon: %8lu kB\n" + "Driver pages: %8lu kB\n", + PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), + PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high), + PAGES2KB(bs.driver_pages)); + + + *eof = 1; + return len; +} +#endif + +static struct notifier_block xenstore_notifier; + +static int __init balloon_init(void) +{ +#if !defined(CONFIG_XEN) +# ifndef XENMEM_get_pod_target +# define XENMEM_get_pod_target 17 + typedef struct xen_pod_target { + uint64_t target_pages; + uint64_t tot_pages; + uint64_t pod_cache_pages; + uint64_t pod_entries; + domid_t domid; + } xen_pod_target_t; +# endif + xen_pod_target_t pod_target = { .domid = DOMID_SELF }; + int rc; +#elif defined(CONFIG_X86) + unsigned long pfn; + struct page *page; +#endif + + if (!is_running_on_xen()) + return -ENODEV; + + IPRINTK("Initialising balloon driver.\n"); + +#ifdef CONFIG_XEN + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); + totalram_pages = bs.current_pages; +#else + rc = HYPERVISOR_memory_op(XENMEM_get_pod_target, &pod_target); + /* + * Xen prior to 3.4.0 masks the memory_op command to 4 bits, thus + * converting XENMEM_get_pod_target to XENMEM_decrease_reservation. + * Fortunately this results in a request with all input fields zero, + * but (due to the way bit 4 and upwards get interpreted) a starting + * extent of 1. When start_extent > nr_extents (>= in newer Xen), we + * simply get start_extent returned. + */ + totalram_bias = HYPERVISOR_memory_op(rc != -ENOSYS && rc != 1 + ? 
XENMEM_maximum_reservation : XENMEM_current_reservation, + &pod_target.domid); + if ((long)totalram_bias != -ENOSYS) { + BUG_ON(totalram_bias < totalram_pages); + bs.current_pages = totalram_bias; + totalram_bias -= totalram_pages; + } else { + totalram_bias = 0; + bs.current_pages = totalram_pages; + } +#endif + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + + init_timer(&balloon_timer); + balloon_timer.data = 0; + balloon_timer.function = balloon_alarm; + +#ifdef CONFIG_PROC_FS + if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) { + WPRINTK("Unable to create /proc/xen/balloon.\n"); + return -1; + } + + balloon_pde->read_proc = balloon_read; + balloon_pde->write_proc = balloon_write; +#endif + balloon_sysfs_init(); + +#if defined(CONFIG_X86) && defined(CONFIG_XEN) + /* Initialise the balloon with excess memory space. */ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + page = pfn_to_page(pfn); + if (!PageReserved(page)) + balloon_append(page); + } +#endif + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(balloon_init); + +static void __exit balloon_exit(void) +{ + balloon_sysfs_exit(); + /* XXX - release balloon here */ +} + +module_exit(balloon_exit); + +void balloon_update_driver_allowance(long delta) +{ + unsigned long flags; + + balloon_lock(flags); + bs.driver_pages += delta; + balloon_unlock(flags); +} + +#ifdef CONFIG_XEN +static int dealloc_pte_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + unsigned long mfn = pte_mfn(*pte); + int ret; + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &mfn); + set_pte_at(&init_mm, addr, pte, __pte_ma(0)); + set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != 1); + return 0; +} +#endif + +struct page **alloc_empty_pages_and_pagevec(int nr_pages) +{ + unsigned long flags; + void *v; + struct page *page, **pagevec; + int i, ret; + + pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); + if (pagevec == NULL) + return NULL; + + for (i = 0; i < nr_pages; i++) { + balloon_lock(flags); + page = balloon_first_page(); + if (page && !PageHighMem(page)) { + UNLIST_PAGE(page); + bs.balloon_low--; + balloon_unlock(flags); + pagevec[i] = page; + continue; + } + balloon_unlock(flags); + + page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_COLD); + if (page == NULL) + goto err; + + v = page_address(page); + scrub_pages(v, 1); + + balloon_lock(flags); + + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long gmfn = page_to_pfn(page); + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &gmfn); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + if (ret == 1) + ret = 0; /* success */ + } else { +#ifdef CONFIG_XEN + ret = apply_to_page_range(&init_mm, (unsigned long)v, + PAGE_SIZE, dealloc_pte_fn, + NULL); +#else + /* Cannot handle non-auto translate mode. 
*/ + ret = 1; +#endif + } + + if (ret != 0) { + balloon_unlock(flags); + balloon_free_page(page); + goto err; + } + + totalram_pages = --bs.current_pages - totalram_bias; + + balloon_unlock(flags); + } + + out: + schedule_work(&balloon_worker); +#ifdef CONFIG_XEN + flush_tlb_all(); +#endif + return pagevec; + + err: + balloon_lock(flags); + while (--i >= 0) + balloon_append(pagevec[i]); + balloon_unlock(flags); + kfree(pagevec); + pagevec = NULL; + goto out; +} + +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) +{ + unsigned long flags; + int i; + + if (pagevec == NULL) + return; + + balloon_lock(flags); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page_count(pagevec[i]) != 1); + balloon_append(pagevec[i]); + } + balloon_unlock(flags); + + kfree(pagevec); + + schedule_work(&balloon_worker); +} + +void balloon_release_driver_page(struct page *page) +{ + unsigned long flags; + + balloon_lock(flags); + balloon_append(page); + bs.driver_pages--; + balloon_unlock(flags); + + schedule_work(&balloon_worker); +} + +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(balloon_release_driver_page); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/balloon/common.h 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,56 @@ +/****************************************************************************** + * balloon/common.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BALLOON_COMMON_H__ +#define __XEN_BALLOON_COMMON_H__ + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. */ + unsigned long current_pages; + unsigned long target_pages; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. 
*/ + unsigned long balloon_low; + unsigned long balloon_high; +}; + +extern struct balloon_stats balloon_stats; +#define bs balloon_stats + +int balloon_sysfs_init(void); +void balloon_sysfs_exit(void); + +void balloon_set_new_target(unsigned long target); + +#endif /* __XEN_BALLOON_COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/balloon/sysfs.c 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,167 @@ +/****************************************************************************** + * balloon/sysfs.c + * + * Xen balloon driver - sysfs interfaces. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#define BALLOON_CLASS_NAME "xen_memory" + +#define BALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct sys_device *dev, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high)); +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages)); + +static ssize_t show_target_kb(struct sys_device *dev, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + const char *buf, + size_t count) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + strcpy(memstring, buf); + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + &attr_driver_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs, +}; + +static struct sysdev_class balloon_sysdev_class = { + set_kset_name(BALLOON_CLASS_NAME), +}; + +static struct sys_device balloon_sysdev; + +static int __init register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +static __exit void unregister_balloon(struct sys_device *sysdev) +{ + int i; + + sysfs_remove_group(&sysdev->kobj, &balloon_info_group); + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); +} + +int __init balloon_sysfs_init(void) +{ + return register_balloon(&balloon_sysdev); +} + +void __exit balloon_sysfs_exit(void) +{ + unregister_balloon(&balloon_sysdev); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/Makefile 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,4 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o +obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o + +blkbk-y := blkback.o xenbus.o interface.o vbd.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/blkback.c 2010-09-23 15:39:04.000000000 +0200 @@ -0,0 +1,672 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. 
This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int blkif_reqs = 64; +module_param_named(reqs, blkif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */ +typedef struct { + blkif_t *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; +} pending_req_t; + +static pending_req_t *pending_reqs; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); + +#define BLKBACK_INVALID_HANDLE (~0) + +static struct page **pending_pages; +static grant_handle_t *pending_grant_handles; + +static inline int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] + +static inline unsigned long vaddr(pending_req_t *req, int seg) +{ + unsigned long pfn = page_to_pfn(pending_page(req, seg)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + + +static int do_block_io_op(blkif_t *blkif); +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static pending_req_t* alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + spin_unlock_irqrestore(&pending_free_lock, flags); + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + spin_unlock_irqrestore(&pending_free_lock, flags); + if (was_empty) + wake_up(&pending_free_wq); +} + +static void unplug_queue(blkif_t *blkif) +{ + if (blkif->plug == NULL) + return; + if (blkif->plug->unplug_fn) + blkif->plug->unplug_fn(blkif->plug); + blk_put_queue(blkif->plug); + blkif->plug = NULL; +} + +static void plug_queue(blkif_t *blkif, struct block_device *bdev) +{ + request_queue_t *q = bdev_get_queue(bdev); + + if (q == blkif->plug) + return; + unplug_queue(blkif); + blk_get_queue(q); + blkif->plug = q; +} + +static void fast_flush_area(pending_req_t *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + blkback_pagemap_clear(pending_page(req, i)); + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + struct vbd *vbd = &blkif->vbd; + + blkif_get(blkif); 
+ + if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + if (unlikely(vbd->size != vbd_size(vbd))) + vbd_resize(blkif); + + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + unplug_queue(blkif); + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + blkif_put(blkif); + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int error) +{ + /* An error fails the entire request. */ + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + DPRINTK("blkback: write barrier op failed, not supported\n"); + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + DPRINTK("Buffer not up-to-date at end of operation, " + "error=%d\n", error); + pending_req->status = BLKIF_RSP_ERROR; + } + + if (atomic_dec_and_test(&pending_req->pendcnt)) { + fast_flush_area(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } +} + +static int end_block_io_op(struct bio *bio, unsigned int done, int error) +{ + if (bio->bi_size != 0) + return 1; + __end_block_io_op(bio->bi_private, error); + bio_put(bio); + return error; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + blkif_request_t req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ + + while ((rc != rp)) { + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) + break; + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. */ + barrier(); + + switch (req.operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + case BLKIF_OP_WRITE_BARRIER: + blkif->st_br_req++; + /* fall through */ + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + default: + /* A good sign something is wrong: sleep for a while to + * avoid excessive CPU consumption by a bad guest. */ + msleep(1); + DPRINTK("error: unknown block io operation [%d]\n", + req.operation); + make_response(blkif, req.id, req.operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct phys_req preq; + struct { + unsigned long buf; unsigned int nsec; + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg; + struct bio *bio = NULL; + int ret, i; + int operation; + + switch (req->operation) { + case BLKIF_OP_READ: + operation = READ; + break; + case BLKIF_OP_WRITE: + operation = WRITE; + break; + case BLKIF_OP_WRITE_BARRIER: + operation = WRITE_BARRIER; + break; + default: + operation = 0; /* make gcc happy */ + BUG(); + } + + /* Check that number of segments is sane. 
*/ + nseg = req->nr_segments; + if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTK("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + preq.dev = req->handle; + preq.sector_number = req->sector_number; + preq.nr_sects = 0; + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + uint32_t flags; + + seg[i].nsec = req->seg[i].last_sect - + req->seg[i].first_sect + 1; + + if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->seg[i].last_sect < req->seg[i].first_sect)) + goto fail_response; + preq.nr_sects += seg[i].nsec; + + flags = GNTMAP_host_map; + if (operation != READ) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + req->seg[i].gref, blkif->domid); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + BUG_ON(ret); + + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status == GNTST_eagain)) + gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map[i]) + if (unlikely(map[i].status != GNTST_okay)) { + DPRINTK("invalid buffer -- could not remap it\n"); + map[i].handle = BLKBACK_INVALID_HANDLE; + ret = 1; + } else { + blkback_pagemap_set(vaddr_pagenr(pending_req, i), + pending_page(pending_req, i), + blkif->domid, req->handle, + req->seg[i].gref); + } + + pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + + set_phys_to_machine( + page_to_pfn(pending_page(pending_req, i)), + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); + seg[i].buf = map[i].dev_bus_addr | + (req->seg[i].first_sect << 9); + } + + if (ret) + goto fail_flush; + + if (vbd_translate(&preq, blkif, operation) != 0) { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? 
"read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); + goto fail_flush; + } + + plug_queue(blkif, preq.bdev); + atomic_set(&pending_req->pendcnt, 1); + blkif_get(blkif); + + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto fail_put_bio; + } + + while ((bio == NULL) || + (bio_add_page(bio, + pending_page(pending_req, i), + seg[i].nsec << 9, + seg[i].buf & ~PAGE_MASK) == 0)) { + if (bio) { + atomic_inc(&pending_req->pendcnt); + submit_bio(operation, bio); + } + + bio = bio_alloc(GFP_KERNEL, nseg-i); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + if (!bio) { + BUG_ON(operation != WRITE_BARRIER); + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = -1; + } + + submit_bio(operation, bio); + + if (operation == READ) + blkif->st_rd_sect += preq.nr_sects; + else if (operation == WRITE || operation == WRITE_BARRIER) + blkif->st_wr_sect += preq.nr_sects; + + return; + + fail_flush: + fast_flush_area(pending_req); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ + return; + + fail_put_bio: + __end_block_io_op(pending_req, -EINVAL); + if (bio) + bio_put(bio); + unplug_queue(blkif); + msleep(1); /* back off a bit */ + return; +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st) +{ + blkif_response_t resp; + unsigned long flags; + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
+ */ + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + + } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { + more_to_do = 1; + } + + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init blkif_init(void) +{ + int i, mmap_pages; + + if (!is_running_on_xen()) + return -ENODEV; + + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + + pending_reqs = kmalloc(sizeof(pending_reqs[0]) * + blkif_reqs, GFP_KERNEL); + pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * + mmap_pages, GFP_KERNEL); + pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); + + if (blkback_pagemap_init(mmap_pages)) + goto out_of_memory; + + if (!pending_reqs || !pending_grant_handles || !pending_pages) + goto out_of_memory; + + for (i = 0; i < mmap_pages; i++) + pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + + blkif_interface_init(); + + memset(pending_reqs, 0, sizeof(pending_reqs)); + INIT_LIST_HEAD(&pending_free); + + for (i = 0; i < blkif_reqs; i++) + list_add_tail(&pending_reqs[i].free_list, &pending_free); + + blkif_xenbus_init(); + + return 0; + + out_of_memory: + kfree(pending_reqs); + kfree(pending_grant_handles); + free_empty_pages_and_pagevec(pending_pages, mmap_pages); + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; +} + +module_init(blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/blkback-pagemap.c 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,96 @@ +#include +#include "blkback-pagemap.h" + +static int blkback_pagemap_size; +static struct blkback_pagemap *blkback_pagemap; + +static inline int +blkback_pagemap_entry_clear(struct blkback_pagemap *map) +{ + static struct blkback_pagemap zero; + return !memcmp(map, &zero, sizeof(zero)); +} + +int +blkback_pagemap_init(int pages) +{ + blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap), + GFP_KERNEL); + if (!blkback_pagemap) + return -ENOMEM; + + blkback_pagemap_size = pages; + return 0; +} +EXPORT_SYMBOL_GPL(blkback_pagemap_init); + +void +blkback_pagemap_set(int idx, struct page *page, + domid_t domid, busid_t busid, grant_ref_t gref) +{ + struct blkback_pagemap *entry; + + BUG_ON(!blkback_pagemap); + BUG_ON(idx >= blkback_pagemap_size); + + SetPageBlkback(page); + set_page_private(page, idx); + + entry = blkback_pagemap + idx; + if (!blkback_pagemap_entry_clear(entry)) { + printk("overwriting pagemap %d: d %u b %u g %u\n", + idx, entry->domid, entry->busid, entry->gref); + BUG(); + } + + entry->domid = domid; + entry->busid = busid; + entry->gref = gref; +} +EXPORT_SYMBOL_GPL(blkback_pagemap_set); + +void +blkback_pagemap_clear(struct page *page) +{ + int idx; + struct blkback_pagemap *entry; + + idx = (int)page_private(page); + + BUG_ON(!blkback_pagemap); + BUG_ON(!PageBlkback(page)); + BUG_ON(idx >= blkback_pagemap_size); + + entry = blkback_pagemap + idx; + if (blkback_pagemap_entry_clear(entry)) { + printk("clearing empty pagemap %d\n", idx); + BUG(); + } + + memset(entry, 0, sizeof(*entry)); +} +EXPORT_SYMBOL_GPL(blkback_pagemap_clear); + +struct blkback_pagemap +blkback_pagemap_read(struct page *page) +{ + int idx; + struct blkback_pagemap *entry; + + idx = (int)page_private(page); + + BUG_ON(!blkback_pagemap); + BUG_ON(!PageBlkback(page)); + BUG_ON(idx >= blkback_pagemap_size); + + entry = blkback_pagemap + idx; + if (blkback_pagemap_entry_clear(entry)) { + 
printk("reading empty pagemap %d\n", idx); + BUG(); + } + + return *entry; +} +EXPORT_SYMBOL(blkback_pagemap_read); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/blkback-pagemap.h 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,37 @@ +#ifndef _BLKBACK_PAGEMAP_H_ +#define _BLKBACK_PAGEMAP_H_ + +#include +#include +#include + +typedef unsigned int busid_t; + +struct blkback_pagemap { + domid_t domid; + busid_t busid; + grant_ref_t gref; +}; + +#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE) + +int blkback_pagemap_init(int); +void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t); +void blkback_pagemap_clear(struct page *); +struct blkback_pagemap blkback_pagemap_read(struct page *); + +#else /* CONFIG_XEN_BLKBACK_PAGEMAP */ + +static inline int blkback_pagemap_init(int pages) { return 0; } +static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom, + busid_t bus, grant_ref_t gnt) {} +static inline void blkback_pagemap_clear(struct page *page) {} +static inline struct blkback_pagemap blkback_pagemap_read(struct page *page) +{ + BUG(); + return (struct blkback_pagemap){-1, -1, -1}; +} + +#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */ + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/common.h 2010-09-23 15:39:04.000000000 +0200 @@ -0,0 +1,153 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blkback-pagemap.h" + + +#define DPRINTK(_f, _a...) 
\ + pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +struct vbd { + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + u32 pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; + sector_t size; /* Cached size parameter */ +}; + +struct backend_info; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + blkif_back_rings_t blk_rings; + struct vm_struct *blk_ring_area; + /* The VBD attached to this interface. */ + struct vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + struct task_struct *xenblkd; + unsigned int waiting_reqs; + request_queue_t *plug; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_br_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; +} blkif_t; + +struct backend_info +{ + struct xenbus_device *dev; + blkif_t *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; +}; + +blkif_t *blkif_alloc(domid_t domid); +void blkif_disconnect(blkif_t *blkif); +void blkif_free(blkif_t *blkif); +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); +void vbd_resize(blkif_t *blkif); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + +/* Create a vbd. */ +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, + unsigned minor, int readonly, int cdrom); +void vbd_free(struct vbd *vbd); + +unsigned long long vbd_size(struct vbd *vbd); +unsigned int vbd_info(struct vbd *vbd); +unsigned long vbd_secsize(struct vbd *vbd); + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); + +void blkif_interface_init(void); + +void blkif_xenbus_init(void); + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +int blkif_schedule(void *arg); + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/interface.c 2010-09-23 15:39:04.000000000 +0200 @@ -0,0 +1,183 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. 
+ * + * Copyright (c) 2004, Keir Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" +#include +#include +#include + +static kmem_cache_t *blkif_cachep; + +blkif_t *blkif_alloc(domid_t domid) +{ + blkif_t *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + blkif->st_print = jiffies; + init_waitqueue_head(&blkif->waiting_to_free); + + return blkif; +} + +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + int ret; + + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, shared_page, blkif->domid); + + gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op); + + if (op.status == GNTST_okay) { + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + ret = 0; + } else { + DPRINTK(" Grant table operation failure %d!\n", (int)op.status); + ret = -EINVAL; + } + + return ret; +} + +static void unmap_frontend_page(blkif_t *blkif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, blkif->shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) +{ + int err; + + /* Already connected through? 
*/ + if (blkif->irq) + return 0; + + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler( + blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); + if (err < 0) + { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +void blkif_disconnect(blkif_t *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void blkif_free(blkif_t *blkif) +{ + if (!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(blkif_cachep, blkif); +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/vbd.c 2010-03-22 12:00:53.000000000 +0100 @@ -0,0 +1,161 @@ +/****************************************************************************** + * blkback/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "common.h" + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) + +unsigned long long vbd_size(struct vbd *vbd) +{ + return vbd_sz(vbd); +} + +unsigned int vbd_info(struct vbd *vbd) +{ + return vbd->type | (vbd->readonly?VDISK_READONLY:0); +} + +unsigned long vbd_secsize(struct vbd *vbd) +{ + return bdev_hardsect_size(vbd->bdev); +} + +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, + unsigned minor, int readonly, int cdrom) +{ + struct vbd *vbd; + struct block_device *bdev; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = open_by_devnum(vbd->pdevice, + vbd->readonly ? FMODE_READ : FMODE_WRITE); + + if (IS_ERR(bdev)) { + DPRINTK("vbd_creat: device %08x could not be opened.\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + vbd->size = vbd_size(vbd); + + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + vbd_free(vbd); + return -ENOENT; + } + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} + +void vbd_free(struct vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev); + vbd->bdev = NULL; +} + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) +{ + struct vbd *vbd = &blkif->vbd; + int rc = -EACCES; + + if ((operation != READ) && vbd->readonly) + goto out; + + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) + goto out; + + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; + + out: + return rc; +} + +void vbd_resize(blkif_t *blkif) +{ + struct vbd *vbd = &blkif->vbd; + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = blkif->be->dev; + unsigned long long new_size = vbd_size(vbd); + + printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size); + vbd->size = new_size; +again: + err = xenbus_transaction_start(&xbt); + if (err) { + printk(KERN_WARNING "Error starting transaction"); + return; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu", + vbd_size(vbd)); + if (err) { + printk(KERN_WARNING "Error writing new size"); + goto abort; + } + /* + * Write the current state; we will use this to synchronize + * the front-end. If the current state is "connected" the + * front-end will get the new size information online. 
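The front-end half of this handshake appears later in blkfront.c: while both ends remain Connected, its connect() routine re-reads the republished "sectors" node and resizes the gendisk. A condensed sketch of that consumer side (helper name illustrative):

	static void demo_refresh_capacity(struct blkfront_info *info)
	{
		unsigned long long sectors;

		/* Re-read the size the backend just republished and update
		 * the disk; mirrors the BLKIF_STATE_CONNECTED case in
		 * blkfront's connect(). */
		if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
				 "sectors", "%Lu", &sectors) == 1)
			set_capacity(info->gd, sectors);
	}
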
+ */ + err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + if (err) { + printk(KERN_WARNING "Error writing the state"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + printk(KERN_WARNING "Error ending transaction"); +abort: + xenbus_transaction_end(xbt, 1); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkback/xenbus.c 2010-11-25 09:36:37.000000000 +0100 @@ -0,0 +1,557 @@ +/* Xenbus code for blkif backend + Copyright (C) 2005 Rusty Russell + Copyright (C) 2005 XenSource Ltd + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include "common.h" + +#undef DPRINTK +#define DPRINTK(fmt, args...) \ + pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +static DEFINE_RWLOCK(sysfs_read_lock); + +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static void backend_changed(struct xenbus_watch *, const char **, + unsigned int); + +static int blkback_name(blkif_t *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + if ((devname = strstr(devpath, "/dev/")) != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +static void update_blkif_status(blkif_t *blkif) +{ + int err; + char name[TASK_COMM_LEN]; + + /* Not ready to connect? */ + if (!blkif->irq || !blkif->vbd.bdev) + return; + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. */ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blkback_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); + return; + } + + err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "block flush"); + return; + } + invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); + + blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); + } +} + + +/**************************************************************** + * sysfs interface for VBD I/O requests + */ + +#define VBD_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + ssize_t ret = -ENODEV; \ + struct xenbus_device *dev; \ + struct backend_info *be; \ + \ + if (!get_device(_dev)) \ + return ret; \ + dev = to_xenbus_device(_dev); \ + read_lock(&sysfs_read_lock); \ + if ((be = dev->dev.driver_data) != NULL) \ + ret = sprintf(buf, format, ##args); \ + read_unlock(&sysfs_read_lock); \ + put_device(_dev); \ + return ret; \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *vbdstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_br_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group vbdstat_group = { + .name = "statistics", + .attrs = vbdstat_attrs, +}; + +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); +VBD_SHOW(mode, "%s\n", be->mode); + +int xenvbd_sysfs_addif(struct xenbus_device *dev) +{ + int error; + + error = device_create_file(&dev->dev, &dev_attr_physical_device); + if (error) + goto fail1; + + error = device_create_file(&dev->dev, &dev_attr_mode); + if (error) + goto fail2; + + error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); + if (error) + goto fail3; + + return 0; + +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); +fail2: device_remove_file(&dev->dev, &dev_attr_mode); +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); + return error; +} + +void xenvbd_sysfs_delif(struct xenbus_device *dev) +{ + sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); + device_remove_file(&dev->dev, &dev_attr_mode); + device_remove_file(&dev->dev, &dev_attr_physical_device); +} + +static int blkback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + DPRINTK(""); + + write_lock(&sysfs_read_lock); + if (be->major || be->minor) + xenvbd_sysfs_delif(dev); + + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + + if (be->blkif) { + blkif_disconnect(be->blkif); + vbd_free(&be->blkif->vbd); + blkif_free(be->blkif); + be->blkif = NULL; + } + + kfree(be); + dev->dev.driver_data = NULL; + write_unlock(&sysfs_read_lock); + return 0; +} + +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures, and watch the store waiting for the hotplug scripts to tell us + * the device's physical major and minor numbers. Switch to InitWait. 
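For reference, the VBD_SHOW() macro defined earlier in this file expands, for the oo_req statistic, to roughly the following show routine plus a read-only device attribute (hand-expanded here, so treat it as an approximation of the preprocessor output):

	static ssize_t show_oo_req(struct device *_dev,
				   struct device_attribute *attr, char *buf)
	{
		ssize_t ret = -ENODEV;
		struct xenbus_device *dev;
		struct backend_info *be;

		if (!get_device(_dev))
			return ret;
		dev = to_xenbus_device(_dev);
		read_lock(&sysfs_read_lock);
		if ((be = dev->dev.driver_data) != NULL)
			ret = sprintf(buf, "%d\n", be->blkif->st_oo_req);
		read_unlock(&sysfs_read_lock);
		put_device(_dev);
		return ret;
	}
	static DEVICE_ATTR(oo_req, S_IRUGO, show_oo_req, NULL);
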
+ */ +static int blkback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev->dev.driver_data = be; + + be->blkif = blkif_alloc(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + + err = xenbus_watch_path2(dev, dev->nodename, "physical-device", + &be->backend_watch, backend_changed); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + DPRINTK("failed"); + blkback_remove(dev); + return err; +} + + +/** + * Callback received when the hotplug scripts have placed the physical-device + * node. Read it and the mode node, and create a vbd. If the frontend is + * ready, connect. + */ +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned major; + unsigned minor; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + int cdrom = 0; + char *device_type; + + DPRINTK(""); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", + &major, &minor); + if (XENBUS_EXIST_ERR(err)) { + /* Since this watch will fire once immediately after it is + registered, we expect this. Ignore it, and wait for the + hotplug scripts. */ + return; + } + if (err != 2) { + xenbus_dev_fatal(dev, err, "reading physical-device"); + return; + } + + if ((be->major || be->minor) && + ((be->major != major) || (be->minor != minor))) { + printk(KERN_WARNING + "blkback: changing physical device (from %x:%x to " + "%x:%x) not supported.\n", be->major, be->minor, + major, minor); + return; + } + + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); + if (IS_ERR(be->mode)) { + err = PTR_ERR(be->mode); + be->mode = NULL; + xenbus_dev_fatal(dev, err, "reading mode"); + return; + } + + device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); + if (!IS_ERR(device_type)) { + cdrom = strcmp(device_type, "cdrom") == 0; + kfree(device_type); + } + + if (be->major == 0 && be->minor == 0) { + /* Front end dir is a number, which is used as the handle. */ + + char *p = strrchr(dev->otherend, '/') + 1; + long handle = simple_strtoul(p, NULL, 0); + + be->major = major; + be->minor = minor; + + err = vbd_create(be->blkif, handle, major, minor, + (NULL == strchr(be->mode, 'w')), cdrom); + if (err) { + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating vbd structure"); + return; + } + + err = xenvbd_sysfs_addif(dev); + if (err) { + vbd_free(&be->blkif->vbd); + be->major = be->minor = 0; + xenbus_dev_fatal(dev, err, "creating sysfs entries"); + return; + } + + /* We're potentially connected now */ + update_blkif_status(be->blkif); + } +} + + +/** + * Callback received when the frontend's state changes. 
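Condensing the hotplug handling above: backend_changed() parses the hex "major:minor" pair written by the hotplug scripts, and vbd_create() (in vbd.c) then opens the corresponding block device. A minimal sketch of those two steps combined; the helper name is illustrative:

	static struct block_device *demo_open_phys(struct xenbus_device *dev,
						   int readonly)
	{
		unsigned major, minor;

		/* "physical-device" is written by the hotplug scripts as
		 * hex "major:minor"; see backend_changed() above. */
		if (xenbus_scanf(XBT_NIL, dev->nodename, "physical-device",
				 "%x:%x", &major, &minor) != 2)
			return ERR_PTR(-ENOENT);

		/* Same open as vbd_create(): writable only when requested. */
		return open_by_devnum(MKDEV(major, minor),
				      readonly ? FMODE_READ : FMODE_WRITE);
	}
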
+ */ +static void frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev->dev.driver_data; + int err; + + DPRINTK("%s", xenbus_strstate(frontend_state)); + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + printk(KERN_INFO "%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + /* Ensure we connect even when two watches fire in + close successsion and we miss the intermediate value + of frontend_state. */ + if (dev->state == XenbusStateConnected) + break; + + /* Enforce precondition before potential leak point. + * blkif_disconnect() is idempotent. + */ + blkif_disconnect(be->blkif); + + err = connect_ring(be); + if (err) + break; + update_blkif_status(be->blkif); + break; + + case XenbusStateClosing: + blkif_disconnect(be->blkif); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + /* implies blkif_disconnect() via blkback_remove() */ + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +/* ** Connection ** */ + + +/** + * Write the physical details regarding the block device to the store, and + * switch to Connected state. + */ +static void connect(struct backend_info *be) +{ + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = be->dev; + + DPRINTK("%s", dev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + return; + } + + err = blkback_barrier(xbt, be, 1); + if (err) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + vbd_size(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sectors", + dev->nodename); + goto abort; + } + + /* FIXME: use a typename instead */ + err = xenbus_printf(xbt, dev->nodename, "info", "%u", + vbd_info(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/info", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", + vbd_secsize(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sector-size", + dev->nodename); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(dev, err, "ending transaction"); + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(dev, err, "switching to Connected state", + dev->nodename); + + return; + abort: + xenbus_transaction_end(xbt, 1); +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + char protocol[64] = ""; + int err; + + DPRINTK("%s", dev->otherend); + + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", 
protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming native"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + printk(KERN_INFO + "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); + + /* Map the shared frame, irq etc. */ + err = blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + + +static struct xenbus_driver blkback = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, + .otherend_changed = frontend_changed +}; + + +void blkif_xenbus_init(void) +{ + xenbus_register_backend(&blkback); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkfront/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,5 @@ + +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o + +xenblk-objs := blkfront.o vbd.o + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkfront/blkfront.c 2010-09-23 15:39:04.000000000 +0200 @@ -0,0 +1,967 @@ +/****************************************************************************** + * blkfront.c + * + * XenLinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2005, XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include "block.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#define BLKIF_STATE_DISCONNECTED 0 +#define BLKIF_STATE_CONNECTED 1 +#define BLKIF_STATE_SUSPENDED 2 + +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +#define GRANT_INVALID_REF 0 + +static void connect(struct blkfront_info *); +static void blkfront_closing(struct blkfront_info *); +static int blkfront_remove(struct xenbus_device *); +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *); +static int setup_blkring(struct xenbus_device *, struct blkfront_info *); + +static void kick_pending_request_queues(struct blkfront_info *); + +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs); +static void blkif_restart_queue(void *arg); +static void blkif_recover(struct blkfront_info *); +static void blkif_completion(struct blk_shadow *); +static void blkif_free(struct blkfront_info *, int); + + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffer for communication with the backend, and + * inform the backend of the appropriate details for those. Switch to + * Initialised state. + */ +static int blkfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, vdevice, i; + struct blkfront_info *info; + + /* FIXME: Use dynamic device id if this is not set. */ + err = xenbus_scanf(XBT_NIL, dev->nodename, + "virtual-device", "%i", &vdevice); + if (err != 1) { + /* go looking in the extended area instead */ + err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", + "%i", &vdevice); + if (err != 1) { + xenbus_dev_fatal(dev, err, "reading virtual-device"); + return err; + } + } + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + + info->xbdev = dev; + info->vdevice = vdevice; + info->connected = BLKIF_STATE_DISCONNECTED; + INIT_WORK(&info->work, blkif_restart_queue, (void *)info); + + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + + /* Front end dir is a number, which is used as the id. */ + info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); + dev->dev.driver_data = info; + + err = talk_to_backend(dev, info); + if (err) { + kfree(info); + dev->dev.driver_data = NULL; + return err; + } + + return 0; +} + + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our blkif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + int err; + + DPRINTK("blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + + err = talk_to_backend(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + blkif_recover(info); + + return err; +} + + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. 
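The shared ring created here is driven later by blkif_queue_request() and flush_requests(); stripped to its essentials, the producer side of that ring protocol looks roughly like the following condensed sketch (not a drop-in replacement):

	static void demo_submit(struct blkfront_info *info, blkif_request_t *src)
	{
		blkif_request_t *ring_req;
		int notify;

		/* Claim the next free slot at the private producer index and
		 * copy the request into the shared page. */
		ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
		*ring_req = *src;
		info->ring.req_prod_pvt++;

		/* Publish the new producer index; only notify the backend if
		 * it may be waiting on an empty ring. */
		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
		if (notify)
			notify_remote_via_irq(info->irq);
	}
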
*/ + err = setup_blkring(dev, info); + if (err) + goto out; + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_blkring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", info->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", + XEN_IO_PROTO_ABI_NATIVE); + if (err) { + message = "writing protocol"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_blkring; + } + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(dev, err, "%s", message); + destroy_blkring: + blkif_free(info, 0); + out: + return err; +} + + +static int setup_blkring(struct xenbus_device *dev, + struct blkfront_info *info) +{ + blkif_sring_t *sring; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + memset(info->sg, 0, sizeof(info->sg)); + + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); + if (err < 0) { + free_page((unsigned long)sring); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = err; + + err = bind_listening_port_to_irqhandler( + dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_listening_port_to_irqhandler"); + goto fail; + } + info->irq = err; + + return 0; +fail: + blkif_free(info, 0); + return err; +} + + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct blkfront_info *info = dev->dev.driver_data; + struct block_device *bd; + + DPRINTK("blkfront:backend_changed.\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + + case XenbusStateConnected: + connect(info); + break; + + case XenbusStateClosing: + bd = bdget(info->dev); + if (bd == NULL) { + xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); + break; + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) + down(&bd->bd_sem); +#else + mutex_lock(&bd->bd_mutex); +#endif + if (info->users > 0) + xenbus_dev_error(dev, -EBUSY, + "Device in use; refusing to close"); + else + blkfront_closing(info); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) + up(&bd->bd_sem); +#else + mutex_unlock(&bd->bd_mutex); +#endif + bdput(bd); + break; + } +} + + +/* ** Connection ** */ + + +/* + * Invoked when the backend is finally 'ready' (and has told produced + * the details about the physical device - #sectors, size, etc). 
+ */ +static void connect(struct blkfront_info *info) +{ + unsigned long long sectors; + unsigned long sector_size; + unsigned int binfo; + int err; + + switch (info->connected) { + case BLKIF_STATE_CONNECTED: + /* + * Potentially, the back-end may be signalling + * a capacity change; update the capacity. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "sectors", "%Lu", §ors); + if (XENBUS_EXIST_ERR(err)) + return; + printk(KERN_INFO "Setting capacity to %Lu\n", + sectors); + set_capacity(info->gd, sectors); + + /* fall through */ + case BLKIF_STATE_SUSPENDED: + return; + } + + DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "sectors", "%Lu", §ors, + "info", "%u", &binfo, + "sector-size", "%lu", §or_size, + NULL); + if (err) { + xenbus_dev_fatal(info->xbdev, err, + "reading backend fields at %s", + info->xbdev->otherend); + return; + } + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%lu", &info->feature_barrier, + NULL); + if (err) + info->feature_barrier = 0; + + err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", + info->xbdev->otherend); + return; + } + + err = xlvbd_sysfs_addif(info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_sysfs_addif at %s", + info->xbdev->otherend); + return; + } + + (void)xenbus_switch_state(info->xbdev, XenbusStateConnected); + + /* Kick pending requests. */ + spin_lock_irq(&blkif_io_lock); + info->connected = BLKIF_STATE_CONNECTED; + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); + + add_disk(info->gd); + + info->is_ready = 1; +} + +/** + * Handle the change of state of the backend to Closing. We must delete our + * device-layer structures now, to ensure that writes are flushed through to + * the backend. Once is this done, we can switch to Closed in + * acknowledgement. + */ +static void blkfront_closing(struct blkfront_info *info) +{ + unsigned long flags; + + DPRINTK("blkfront_closing: %d removed\n", info->vdevice); + + if (info->rq == NULL) + goto out; + + spin_lock_irqsave(&blkif_io_lock, flags); + /* No more blkif_request(). */ + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irqrestore(&blkif_io_lock, flags); + + /* Flush gnttab callback work. Must be done with no locks held. 
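The work being flushed is the queue restart scheduled by blkif_restart_queue_callback(), which blkif_queue_request() registers through gnttab_request_free_callback() whenever it runs out of grant references; cancelling the callback above and flushing the workqueue here ensures no restart can run against the device being deleted.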
*/ + flush_scheduled_work(); + + xlvbd_sysfs_delif(info); + + xlvbd_del(info); + + out: + if (info->xbdev) + xenbus_frontend_closed(info->xbdev); +} + + +static int blkfront_remove(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + DPRINTK("blkfront_remove: %s removed\n", dev->nodename); + + blkif_free(info, 0); + + if(info->users == 0) + kfree(info); + else + info->xbdev = NULL; + + return 0; +} + + +static inline int GET_ID_FROM_FREELIST( + struct blkfront_info *info) +{ + unsigned long free = info->shadow_free; + BUG_ON(free >= BLK_RING_SIZE); + info->shadow_free = info->shadow[free].req.id; + info->shadow[free].req.id = 0x0fffffee; /* debug */ + return free; +} + +static inline void ADD_ID_TO_FREELIST( + struct blkfront_info *info, unsigned long id) +{ + info->shadow[id].req.id = info->shadow_free; + info->shadow[id].request = 0; + info->shadow_free = id; +} + +static inline void flush_requests(struct blkfront_info *info) +{ + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + + if (notify) + notify_remote_via_irq(info->irq); +} + +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. */ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + spin_lock_irq(&blkif_io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); +} + +int blkif_open(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + + if (!info->xbdev) + return -ENODEV; + info->users++; + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + info->users--; + if (info->users == 0) { + /* Check whether we have been instructed to close. We will + have ignored this request initially, as the device was + still mounted. 
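The other half of this handshake is in backend_changed() above: while users is still non-zero the backend's Closing request is refused with -EBUSY, so the final blkif_release() has to complete the deferred close here once the last opener goes away.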
*/ + struct xenbus_device * dev = info->xbdev; + + if (!dev) { + blkfront_closing(info); + kfree(info); + } else if (xenbus_read_driver_state(dev->otherend) + == XenbusStateClosing && info->is_ready) + blkfront_closing(info); + } + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; + int i; + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blkif_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: { + struct gendisk *gd = info->gd; + if (gd->flags & GENHD_FL_CD) + return 0; + return -EINVAL; + } + default: + if (info->mi && info->gd) { + switch (info->mi->major) { + case SCSI_DISK0_MAJOR: + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: + case SCSI_CDROM_MAJOR: + return scsi_cmd_ioctl(filep, info->gd, command, + (void __user *)argument); + } + } + + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + + +int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + unsigned long buffer_mfn; + blkif_request_t *ring_req; + unsigned long id; + unsigned int fsect, lsect; + int i, ref; + grant_ref_t gref_head; + struct scatterlist *sg; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = GET_ID_FROM_FREELIST(info); + info->shadow[id].request = (unsigned long)req; + + ring_req->id = id; + ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->handle = info->handle; + + ring_req->operation = rq_data_dir(req) ? 
+ BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + + ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); + BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + for (i = 0; i < ring_req->nr_segments; ++i) { + sg = info->sg + i; + buffer_mfn = page_to_phys(sg->page) >> PAGE_SHIFT; + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req) ? GTF_readonly : 0 ); + + info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); + ring_req->seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + gnttab_free_grant_references(gref_head); + + return 0; +} + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +void do_blkif_request(request_queue_t *rq) +{ + struct blkfront_info *info = NULL; + struct request *req; + int queued; + + DPRINTK("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = elv_next_request(rq)) != NULL) { + info = req->rq_disk->private_data; + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (RING_FULL(&info->ring)) + goto wait; + + DPRINTK("do_blk_req %p: cmd %p, sec %llx, " + "(%u/%li) buffer:%p [%s]\n", + req, req->cmd, (long long)req->sector, + req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read"); + + + blkdev_dequeue_request(req); + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); + wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + + queued++; + } + + if (queued != 0) + flush_requests(info); +} + + +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct request *req; + blkif_response_t *bret; + RING_IDX i, rp; + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + spin_unlock_irqrestore(&blkif_io_lock, flags); + return IRQ_HANDLED; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. 
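The barrier pairs with the producer-side write barrier in the backend: rsp_prod is sampled before rmb(), so every response slot up to 'rp' is guaranteed visible before the loop reads it. RING_FINAL_CHECK_FOR_RESPONSES() at the end of the loop then re-arms rsp_event and re-checks for responses that arrived meanwhile, closing the race between leaving the loop and the next event notification.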
*/ + + for (i = info->ring.rsp_cons; i != rp; i++) { + unsigned long id; + int ret; + + bret = RING_GET_RESPONSE(&info->ring, i); + id = bret->id; + req = (struct request *)info->shadow[id].request; + + blkif_completion(&info->shadow[id]); + + ADD_ID_TO_FREELIST(info, id); + + uptodate = (bret->status == BLKIF_RSP_OKAY); + switch (bret->operation) { + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk("blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + uptodate = -EOPNOTSUPP; + info->feature_barrier = 0; + xlvbd_barrier(info); + } + /* fall through */ + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(bret->status != BLKIF_RSP_OKAY)) + DPRINTK("Bad return from blkdev data " + "request: %x\n", bret->status); + + ret = end_that_request_first(req, uptodate, + req->hard_nr_sectors); + BUG_ON(ret); + end_that_request_last(req, uptodate); + break; + default: + BUG(); + } + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + int more_to_do; + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else + info->ring.sring->rsp_event = i + 1; + + kick_pending_request_queues(info); + + spin_unlock_irqrestore(&blkif_io_lock, flags); + + return IRQ_HANDLED; +} + +static void blkif_free(struct blkfront_info *info, int suspend) +{ + /* Prevent new requests being issued until we fix things up. */ + spin_lock_irq(&blkif_io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irq(&blkif_io_lock); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + + /* Free resources associated with old device channel. */ + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, + (unsigned long)info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + if (info->irq) + unbind_from_irqhandler(info->irq, info); + info->irq = 0; +} + +static void blkif_completion(struct blk_shadow *s) +{ + int i; + for (i = 0; i < s->req.nr_segments; i++) + gnttab_end_foreign_access(s->req.seg[i].gref, 0UL); +} + +static void blkif_recover(struct blkfront_info *info) +{ + int i; + blkif_request_t *req; + struct blk_shadow *copy; + int j; + + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmalloc(sizeof(info->shadow), GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH); + memcpy(copy, info->shadow, sizeof(info->shadow)); + + /* Stage 2: Set up free list. */ + memset(&info->shadow, 0, sizeof(info->shadow)); + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; + info->shadow_free = info->ring.req_prod_pvt; + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + + /* Stage 3: Find pending requests and requeue them. */ + for (i = 0; i < BLK_RING_SIZE; i++) { + /* Not in use? */ + if (copy[i].request == 0) + continue; + + /* Grab a request slot and copy shadow state into it. */ + req = RING_GET_REQUEST( + &info->ring, info->ring.req_prod_pvt); + *req = copy[i].req; + + /* We get a new request id, and must reset the shadow state. */ + req->id = GET_ID_FROM_FREELIST(info); + memcpy(&info->shadow[req->id], ©[i], sizeof(copy[i])); + + /* Rewrite any grant references invalidated by susp/resume. 
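After a save/restore or migration the machine frames backing each data page may have changed, so the grant entries recorded in the shadow copy are stale; pfn_to_mfn() on the saved pseudo-physical frames recovers the current machine frames, and each segment is granted to the backend afresh before the copied request is reissued.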
*/ + for (j = 0; j < req->nr_segments; j++) + gnttab_grant_foreign_access_ref( + req->seg[j].gref, + info->xbdev->otherend_id, + pfn_to_mfn(info->shadow[req->id].frame[j]), + rq_data_dir((struct request *) + info->shadow[req->id].request) ? + GTF_readonly : 0); + info->shadow[req->id].req = *req; + + info->ring.req_prod_pvt++; + } + + kfree(copy); + + (void)xenbus_switch_state(info->xbdev, XenbusStateConnected); + + spin_lock_irq(&blkif_io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; + + /* Send off requeued requests */ + flush_requests(info); + + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + + spin_unlock_irq(&blkif_io_lock); +} + +int blkfront_is_ready(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev.driver_data; + + return info->is_ready && info->xbdev; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; +MODULE_ALIAS("xen:vbd"); + +static struct xenbus_driver blkfront = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .otherend_changed = backend_changed, + .is_ready = blkfront_is_ready, +}; + + +static int __init xlblk_init(void) +{ + if (!is_running_on_xen()) + return -ENODEV; + + return xenbus_register_frontend(&blkfront); +} +module_init(xlblk_init); + + +static void __exit xlblk_exit(void) +{ + return xenbus_unregister_driver(&blkfront); +} +module_exit(xlblk_exit); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkfront/block.h 2010-02-24 13:13:46.000000000 +0100 @@ -0,0 +1,160 @@ +/****************************************************************************** + * block.h + * + * Shared definitions between all levels of XenLinux Virtual block devices. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004-2005, Christian Limpach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a) + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +struct xlbd_type_info +{ + int partn_shift; + int disks_per_major; + char *devname; + char *diskname; +}; + +struct xlbd_major_info +{ + int major; + int index; + int usage; + struct xlbd_type_info *type; + struct xlbd_minor_state *minors; +}; + +struct blk_shadow { + blkif_request_t req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + struct xenbus_device *xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + int connected; + int ring_ref; + blkif_front_ring_t ring; + struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int irq; + struct xlbd_major_info *mi; + request_queue_t *rq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + int feature_barrier; + int is_ready; + + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; + +extern spinlock_t blkif_io_lock; + +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int blkif_getgeo(struct block_device *, struct hd_geometry *); +extern int blkif_check(dev_t dev); +extern int blkif_revalidate(dev_t dev); +extern void do_blkif_request (request_queue_t *rq); + +/* Virtual block-device subsystem. */ +/* Note that xlvbd_add doesn't call add_disk for you: you're expected + to call add_disk on info->gd once the disk is properly connected + up. */ +int xlvbd_add(blkif_sector_t capacity, int device, + u16 vdisk_info, u16 sector_size, struct blkfront_info *info); +void xlvbd_del(struct blkfront_info *info); +int xlvbd_barrier(struct blkfront_info *info); + +#ifdef CONFIG_SYSFS +int xlvbd_sysfs_addif(struct blkfront_info *info); +void xlvbd_sysfs_delif(struct blkfront_info *info); +#else +static inline int xlvbd_sysfs_addif(struct blkfront_info *info) +{ + return 0; +} + +static inline void xlvbd_sysfs_delif(struct blkfront_info *info) +{ + ; +} +#endif + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blkfront/vbd.c 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,553 @@ +/****************************************************************************** + * vbd.c + * + * XenLinux virtual block-device driver (xvd). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. 
Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004-2005, Christian Limpach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "block.h" +#include +#include + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include +#endif + +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + +#define EXT_SHIFT 28 +#define EXTENDED (1<= KERNEL_VERSION(2,6,16) + .getgeo = blkif_getgeo +#endif +}; + +DEFINE_SPINLOCK(blkif_io_lock); + +static struct xlbd_major_info * +xlbd_alloc_major_info(int major, int minor, int index) +{ + struct xlbd_major_info *ptr; + struct xlbd_minor_state *minors; + int do_register; + + ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); + if (ptr == NULL) + return NULL; + + ptr->major = major; + minors = kmalloc(sizeof(*minors), GFP_KERNEL); + if (minors == NULL) { + kfree(ptr); + return NULL; + } + + minors->bitmap = kzalloc(BITS_TO_LONGS(256) * sizeof(*minors->bitmap), + GFP_KERNEL); + if (minors->bitmap == NULL) { + kfree(minors); + kfree(ptr); + return NULL; + } + + spin_lock_init(&minors->lock); + minors->nr = 256; + do_register = 1; + + switch (index) { + case XLBD_MAJOR_IDE_RANGE: + ptr->type = &xlbd_ide_type; + ptr->index = index - XLBD_MAJOR_IDE_START; + break; + case XLBD_MAJOR_SCSI_RANGE: + ptr->type = &xlbd_scsi_type; + ptr->index = index - XLBD_MAJOR_SCSI_START; + break; + case XLBD_MAJOR_VBD_RANGE: + ptr->index = 0; + if ((index - XLBD_MAJOR_VBD_START) == 0) + ptr->type = &xlbd_vbd_type; + else + ptr->type = &xlbd_vbd_type_ext; + + /* + * if someone already registered block major 202, + * don't try to register it again + */ + if (major_info[XLBD_MAJOR_VBD_ALT(index)] != NULL) { + kfree(minors->bitmap); + kfree(minors); + minors = major_info[XLBD_MAJOR_VBD_ALT(index)]->minors; + do_register = 0; + } + break; + } + + if (do_register) { + if (register_blkdev(ptr->major, ptr->type->devname)) { + kfree(minors->bitmap); + kfree(minors); + kfree(ptr); + return NULL; + } + + printk("xen-vbd: registered block device major %i\n", ptr->major); + } + + ptr->minors = minors; + major_info[index] = ptr; + return ptr; +} + +static struct xlbd_major_info * +xlbd_get_major_info(int major, int minor, int vdevice) 
+{ + struct xlbd_major_info *mi; + int index; + + switch (major) { + case IDE0_MAJOR: index = 0; break; + case IDE1_MAJOR: index = 1; break; + case IDE2_MAJOR: index = 2; break; + case IDE3_MAJOR: index = 3; break; + case IDE4_MAJOR: index = 4; break; + case IDE5_MAJOR: index = 5; break; + case IDE6_MAJOR: index = 6; break; + case IDE7_MAJOR: index = 7; break; + case IDE8_MAJOR: index = 8; break; + case IDE9_MAJOR: index = 9; break; + case SCSI_DISK0_MAJOR: index = 10; break; + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: + index = 11 + major - SCSI_DISK1_MAJOR; + break; + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: + index = 18 + major - SCSI_DISK8_MAJOR; + break; + case SCSI_CDROM_MAJOR: index = 26; break; + default: + if (!VDEV_IS_EXTENDED(vdevice)) + index = 27; + else + index = 28; + break; + } + + mi = ((major_info[index] != NULL) ? major_info[index] : + xlbd_alloc_major_info(major, minor, index)); + if (mi) + mi->usage++; + return mi; +} + +static void +xlbd_put_major_info(struct xlbd_major_info *mi) +{ + mi->usage--; + /* XXX: release major if 0 */ +} + +static int +xlbd_reserve_minors(struct xlbd_major_info *mi, unsigned int minor, + unsigned int nr_minors) +{ + struct xlbd_minor_state *ms = mi->minors; + unsigned int end = minor + nr_minors; + int rc; + + if (end > ms->nr) { + unsigned long *bitmap, *old; + + bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), + GFP_KERNEL); + if (bitmap == NULL) + return -ENOMEM; + + spin_lock(&ms->lock); + if (end > ms->nr) { + old = ms->bitmap; + memcpy(bitmap, ms->bitmap, + BITS_TO_LONGS(ms->nr) * sizeof(*bitmap)); + ms->bitmap = bitmap; + ms->nr = BITS_TO_LONGS(end) * BITS_PER_LONG; + } else + old = bitmap; + spin_unlock(&ms->lock); + kfree(old); + } + + spin_lock(&ms->lock); + if (find_next_bit(ms->bitmap, end, minor) >= end) { + for (; minor < end; ++minor) + __set_bit(minor, ms->bitmap); + rc = 0; + } else + rc = -EBUSY; + spin_unlock(&ms->lock); + + return rc; +} + +static void +xlbd_release_minors(struct xlbd_major_info *mi, unsigned int minor, + unsigned int nr_minors) +{ + struct xlbd_minor_state *ms = mi->minors; + unsigned int end = minor + nr_minors; + + BUG_ON(end > ms->nr); + spin_lock(&ms->lock); + for (; minor < end; ++minor) + __clear_bit(minor, ms->bitmap); + spin_unlock(&ms->lock); +} + +static int +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +{ + request_queue_t *rq; + + rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + if (rq == NULL) + return -1; + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + /* Make sure we don't use bounce buffers. 
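BLK_BOUNCE_ANY tells the block layer that any page, highmem included, is acceptable for I/O: data moves via grant references rather than device DMA, so there is no low-memory restriction for bounce buffering to work around.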
*/ + blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); + + gd->queue = rq; + + return 0; +} + +static int +xlvbd_alloc_gendisk(int major, int minor, blkif_sector_t capacity, int vdevice, + u16 vdisk_info, u16 sector_size, + struct blkfront_info *info) +{ + struct gendisk *gd; + struct xlbd_major_info *mi; + int nr_minors = 1; + int err = -ENODEV; + unsigned int offset; + + BUG_ON(info->gd != NULL); + BUG_ON(info->mi != NULL); + BUG_ON(info->rq != NULL); + + mi = xlbd_get_major_info(major, minor, vdevice); + if (mi == NULL) + goto out; + info->mi = mi; + + if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0) + nr_minors = 1 << mi->type->partn_shift; + + err = xlbd_reserve_minors(mi, minor, nr_minors); + if (err) + goto out; + err = -ENODEV; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto release; + + offset = mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift); + if (nr_minors > 1) { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c", + mi->type->diskname, 'a' + offset ); + } + else { + sprintf(gd->disk_name, "%s%c%c", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26) ); + } + } + else { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c%d", + mi->type->diskname, + 'a' + offset, + minor & ((1 << mi->type->partn_shift) - 1)); + } + else { + sprintf(gd->disk_name, "%s%c%c%d", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26), + minor & ((1 << mi->type->partn_shift) - 1)); + } + } + + gd->major = mi->major; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); + goto release; + } + + info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + return 0; + + release: + xlbd_release_minors(mi, minor, nr_minors); + out: + if (mi) + xlbd_put_major_info(mi); + info->mi = NULL; + return err; +} + +int +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, + u16 sector_size, struct blkfront_info *info) +{ + struct block_device *bd; + int err = 0; + int major, minor; + + if ((vdevice>>EXT_SHIFT) > 1) { + /* this is above the extended range; something is wrong */ + printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", vdevice); + return -ENODEV; + } + + if (!VDEV_IS_EXTENDED(vdevice)) { + major = BLKIF_MAJOR(vdevice); + minor = BLKIF_MINOR(vdevice); + } + else { + major = 202; + minor = BLKIF_MINOR_EXT(vdevice); + } + + info->dev = MKDEV(major, minor); + bd = bdget(info->dev); + if (bd == NULL) + return -ENODEV; + + err = xlvbd_alloc_gendisk(major, minor, capacity, vdevice, vdisk_info, + sector_size, info); + + bdput(bd); + return err; +} + +void +xlvbd_del(struct blkfront_info *info) +{ + unsigned int minor, nr_minors; + + if (info->mi == NULL) + return; + + BUG_ON(info->gd == NULL); + minor = info->gd->first_minor; + nr_minors = info->gd->minors; + del_gendisk(info->gd); + put_disk(info->gd); + info->gd = NULL; + + xlbd_release_minors(info->mi, minor, nr_minors); + xlbd_put_major_info(info->mi); + info->mi = NULL; + + BUG_ON(info->rq == NULL); + blk_cleanup_queue(info->rq); + info->rq = NULL; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +int +xlvbd_barrier(struct blkfront_info *info) +{ + int 
err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL); + if (err) + return err; + printk(KERN_INFO "blkfront: %s: barriers %s\n", + info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled"); + return 0; +} +#else +int +xlvbd_barrier(struct blkfront_info *info) +{ + printk(KERN_INFO "blkfront: %s: barriers disabled\n", info->gd->disk_name); + return -ENOSYS; +} +#endif + +#ifdef CONFIG_SYSFS +static ssize_t show_media(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct blkfront_info *info = xendev->dev.driver_data; + + if (info->gd->flags & GENHD_FL_CD) + return sprintf(buf, "cdrom\n"); + return sprintf(buf, "disk\n"); +} + +static struct device_attribute xlvbd_attrs[] = { + __ATTR(media, S_IRUGO, show_media, NULL), +}; + +int xlvbd_sysfs_addif(struct blkfront_info *info) +{ + int i; + int error = 0; + + for (i = 0; i < ARRAY_SIZE(xlvbd_attrs); i++) { + error = device_create_file(info->gd->driverfs_dev, + &xlvbd_attrs[i]); + if (error) + goto fail; + } + return 0; + +fail: + while (--i >= 0) + device_remove_file(info->gd->driverfs_dev, &xlvbd_attrs[i]); + return error; +} + +void xlvbd_sysfs_delif(struct blkfront_info *info) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(xlvbd_attrs); i++) + device_remove_file(info->gd->driverfs_dev, &xlvbd_attrs[i]); +} + +#endif /* CONFIG_SYSFS */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,5 @@ +LINUXINCLUDE += -I../xen/include/public/io + +obj-$(CONFIG_XEN_BLKDEV_TAP) := xenblktap.o + +xenblktap-y := xenbus.o interface.o blktap.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap/blktap.c 2011-02-17 09:58:10.000000000 +0100 @@ -0,0 +1,1761 @@ +/****************************************************************************** + * drivers/xen/blktap/blktap.c + * + * Back-end driver for user level virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. Requests + * are remapped to a user-space memory region. + * + * Based on the blkback driver code. + * + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield + * + * Clean ups and fix ups: + * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "common.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ +#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. + */ +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) +static int mmap_pages = MMAP_PAGES; + +#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we + * have a bunch of pages reserved for shared + * memory rings. + */ + +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + +typedef struct domid_translate_ext { + unsigned short domid; + u32 busid; +} domid_translate_ext_t ; + +/*Data struct associated with each of the tapdisk devices*/ +typedef struct tap_blkif { + struct mm_struct *mm; /*User address space */ + unsigned long rings_vstart; /*Kernel memory mapping */ + unsigned long user_vstart; /*User memory mapping */ + unsigned long dev_inuse; /*One process opens device at a time. */ + unsigned long dev_pending; /*In process of being opened */ + unsigned long ring_ok; /*make this ring->state */ + blkif_front_ring_t ufe_ring; /*Rings up to user space. */ + wait_queue_head_t wait; /*for poll */ + unsigned long mode; /*current switching mode */ + int minor; /*Minor number for tapdisk device */ + pid_t pid; /*tapdisk process id */ + enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace + shutdown */ + struct idx_map { + u16 mem, req; + } *idx_map; /*Record the user ring id to kern + [req id, idx] tuple */ + blkif_t *blkif; /*Associate blkif with tapdev */ + struct domid_translate_ext trans; /*Translation from domid to bus. */ + struct vm_foreign_map foreign_map; /*Mapping page */ +} tap_blkif_t; + +static struct tap_blkif *tapfds[MAX_TAP_DEV]; +static int blktap_next_minor; + +/* Run-time switchable: /sys/module/blktap/parameters/ */ +static unsigned int log_stats = 0; +static unsigned int debug_lvl = 0; +module_param(log_stats, int, 0644); +module_param(debug_lvl, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. 
When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +typedef struct { + blkif_t *blkif; + u64 id; + unsigned short mem_idx; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; + int inuse; +} pending_req_t; + +static pending_req_t *pending_reqs[MAX_PENDING_REQS]; +static struct list_head pending_free; +static DEFINE_SPINLOCK(pending_free_lock); +static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq); +static int alloc_pending_reqs; + +static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { + return (req - pending_reqs[idx]); +} + +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +#define BLKBACK_INVALID_HANDLE (~0) + +static struct page **foreign_pages[MAX_DYNAMIC_MEM]; +static inline struct page *idx_to_page( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx; + return foreign_pages[mmap_idx][arr_idx]; +} +static inline unsigned long idx_to_kaddr( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned long pfn = page_to_pfn(idx_to_page(mmap_idx,req_idx,sg_idx)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned short mmap_alloc = 0; +static unsigned short mmap_lock = 0; +static unsigned short mmap_inuse = 0; + +/****************************************************************** + * GRANT HANDLES + */ + +/* When using grant tables to map a frame for device access then the + * handle returned must be used to unmap the frame. This is needed to + * drop the ref count on the frame. + */ +struct grant_handle_pair +{ + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +static struct grant_handle_pair + pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; +#define pending_handle(_id, _idx, _i) \ + (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \ + + (_i)]) + + +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/ + +#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */ +#define BLKTAP_DEV_DIR "/dev/xen" + +static int blktap_major; + +/* blktap IOCTLs: */ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_SENDPID 4 +#define BLKTAP_IOCTL_NEWINTF 5 +#define BLKTAP_IOCTL_MINOR 6 +#define BLKTAP_IOCTL_MAJOR 7 +#define BLKTAP_QUERY_ALLOC_REQS 8 +#define BLKTAP_IOCTL_FREEINTF 9 +#define BLKTAP_IOCTL_NEWINTF_EXT 50 +#define BLKTAP_IOCTL_PRINT_IDXS 100 + +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */ + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ((arg == BLKTAP_MODE_PASSTHROUGH ) || + (arg == BLKTAP_MODE_INTERCEPT_FE) || + (arg == BLKTAP_MODE_INTERPOSE )); +} + +/* Requests passing through the tap to userspace are re-assigned an ID. + * We must record a mapping between the BE [IDX,ID] tuple and the userspace + * ring ID. 
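+ * That mapping lives in tap_blkif_t's idx_map[] array, indexed by the user
+ * ring ID; a slot whose .mem field is INVALID_MIDX is free.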
+ */ + +#define INVALID_MIDX 0xdead + +/*TODO: Convert to a free list*/ +static inline unsigned int GET_NEXT_REQ(const struct idx_map *idx_map) +{ + unsigned int i; + + for (i = 0; i < MAX_PENDING_REQS; i++) + if (idx_map[i].mem == INVALID_MIDX) + break; + + return i; +} + +static inline unsigned int OFFSET_TO_USR_IDX(unsigned long offset) +{ + return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; +} + +static inline unsigned int OFFSET_TO_SEG(unsigned long offset) +{ + return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; +} + + +#define BLKTAP_INVALID_HANDLE(_g) \ + (((_g->kernel) == INVALID_GRANT_HANDLE) && \ + ((_g->user) == INVALID_GRANT_HANDLE)) + +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ + (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \ + } while(0) + + +/****************************************************************** + * BLKTAP VM OPS + */ + +static struct page *blktap_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. + */ + + return NOPAGE_SIGBUS; +} + +static pte_t blktap_clear_pte(struct vm_area_struct *vma, + unsigned long uvaddr, + pte_t *ptep, int is_fullmm) +{ + pte_t copy; + tap_blkif_t *info = NULL; + unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0; + unsigned long offset, uvstart = 0; + struct page *pg; + struct grant_handle_pair *khandle; + struct gnttab_unmap_grant_ref unmap[2]; + + /* + * If the address is before the start of the grant mapped region or + * if vm_file is NULL (meaning mmap failed and we have nothing to do) + */ + if (vma->vm_file != NULL) { + info = vma->vm_file->private_data; + uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); + } + if (vma->vm_file == NULL || uvaddr < uvstart) + return ptep_get_and_clear_full(vma->vm_mm, uvaddr, + ptep, is_fullmm); + + /* TODO Should these be changed to if statements? */ + BUG_ON(!info); + BUG_ON(!info->idx_map); + + offset = (uvaddr - uvstart) >> PAGE_SHIFT; + usr_idx = OFFSET_TO_USR_IDX(offset); + seg = OFFSET_TO_SEG(offset); + + pending_idx = info->idx_map[usr_idx].req; + mmap_idx = info->idx_map[usr_idx].mem; + + pg = idx_to_page(mmap_idx, pending_idx, seg); + ClearPageReserved(pg); + info->foreign_map.map[offset + RING_PAGES] = NULL; + + khandle = &pending_handle(mmap_idx, pending_idx, seg); + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + unsigned long pfn = page_to_pfn(pg); + + gnttab_set_unmap_op(&unmap[count], + (unsigned long)pfn_to_kaddr(pfn), + GNTMAP_host_map, khandle->kernel); + count++; + + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + + copy = *ptep; + gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep), + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + count++; + } else { + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)); + + /* USING SHADOW PAGE TABLES. 
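+		 * There is no user-space grant mapping to tear down in
+		 * auto-translated mode, so the PTE is simply cleared with the
+		 * generic helper and no unmap operation is queued for it.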
*/ + copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, + is_fullmm); + } + + if (count) { + BLKTAP_INVALIDATE_HANDLE(khandle); + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, count)) + BUG(); + } + + return copy; +} + +static void blktap_vma_open(struct vm_area_struct *vma) +{ + tap_blkif_t *info; + if (vma->vm_file == NULL) + return; + + info = vma->vm_file->private_data; + vma->vm_private_data = + &info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT]; +} + +/* tricky part + * When partial munmapping, ->open() is called only splitted vma which + * will be released soon. * See split_vma() and do_munmap() in mm/mmap.c + * So there is no chance to fix up vm_private_data of the end vma. + */ +static void blktap_vma_close(struct vm_area_struct *vma) +{ + tap_blkif_t *info; + struct vm_area_struct *next = vma->vm_next; + + if (next == NULL || + vma->vm_ops != next->vm_ops || + vma->vm_end != next->vm_start || + vma->vm_file == NULL || + vma->vm_file != next->vm_file) + return; + + info = vma->vm_file->private_data; + next->vm_private_data = + &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT]; +} + +static struct vm_operations_struct blktap_vm_ops = { + nopage: blktap_nopage, + zap_pte: blktap_clear_pte, + open: blktap_vma_open, + close: blktap_vma_close, +}; + +/****************************************************************** + * BLKTAP FILE OPS + */ + +/*Function Declarations*/ +static tap_blkif_t *get_next_free_dev(void); +static int blktap_open(struct inode *inode, struct file *filp); +static int blktap_release(struct inode *inode, struct file *filp); +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); +static unsigned int blktap_poll(struct file *file, poll_table *wait); + +static const struct file_operations blktap_fops = { + .owner = THIS_MODULE, + .poll = blktap_poll, + .ioctl = blktap_ioctl, + .open = blktap_open, + .release = blktap_release, + .mmap = blktap_mmap, +}; + + +static tap_blkif_t *get_next_free_dev(void) +{ + struct class *class; + tap_blkif_t *info; + int minor; + + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. + */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&pending_free_lock); + + /* tapfds[0] is always NULL */ + + for (minor = 1; minor < blktap_next_minor; minor++) { + info = tapfds[minor]; + /* we could have failed a previous attempt. */ + if (!info || + ((!test_bit(0, &info->dev_inuse)) && + (info->dev_pending == 0)) ) { + info->dev_pending = 1; + goto found; + } + } + info = NULL; + minor = -1; + + /* + * We didn't find free device. If we can still allocate + * more, then we grab the next device minor that is + * available. This is done while we are still under + * the protection of the pending_free_lock. + */ + if (blktap_next_minor < MAX_TAP_DEV) + minor = blktap_next_minor++; +found: + spin_unlock_irq(&pending_free_lock); + + if (!info && minor > 0) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (unlikely(!info)) { + /* + * If we failed here, try to put back + * the next minor number. But if one + * was just taken, then we just lose this + * minor. We can try to allocate this + * minor again later. 
+ */ + spin_lock_irq(&pending_free_lock); + if (blktap_next_minor == minor+1) + blktap_next_minor--; + spin_unlock_irq(&pending_free_lock); + goto out; + } + + info->minor = minor; + /* + * Make sure that we have a minor before others can + * see us. + */ + wmb(); + tapfds[minor] = info; + + if ((class = get_xen_class()) != NULL) + class_device_create(class, NULL, + MKDEV(blktap_major, minor), NULL, + "blktap%d", minor); + } + +out: + return info; +} + +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) +{ + tap_blkif_t *info; + int i; + + for (i = 1; i < blktap_next_minor; i++) { + info = tapfds[i]; + if ( info && + (info->trans.domid == domid) && + (info->trans.busid == xenbus_id) ) { + info->blkif = blkif; + info->status = RUNNING; + return i; + } + } + return -1; +} + +void signal_tapdisk(int idx) +{ + tap_blkif_t *info; + struct task_struct *ptask; + struct mm_struct *mm; + + /* + * if the userland tools set things up wrong, this could be negative; + * just don't try to signal in this case + */ + if (idx < 0 || idx >= MAX_TAP_DEV) + return; + + info = tapfds[idx]; + if (!info) + return; + + if (info->pid > 0) { + ptask = find_task_by_pid(info->pid); + if (ptask) + info->status = CLEANSHUTDOWN; + } + info->blkif = NULL; + + mm = xchg(&info->mm, NULL); + if (mm) + mmput(mm); +} + +static int blktap_open(struct inode *inode, struct file *filp) +{ + blkif_sring_t *sring; + int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info; + int i; + + /* ctrl device, treat differently */ + if (!idx) + return 0; + if (idx < 0 || idx >= MAX_TAP_DEV) { + WPRINTK("No device /dev/xen/blktap%d\n", idx); + return -ENODEV; + } + + info = tapfds[idx]; + if (!info) { + WPRINTK("Unable to open device /dev/xen/blktap%d\n", + idx); + return -ENODEV; + } + + DPRINTK("Opening device /dev/xen/blktap%d\n",idx); + + /*Only one process can access device at a time*/ + if (test_and_set_bit(0, &info->dev_inuse)) + return -EBUSY; + + info->dev_pending = 0; + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (sring == NULL) + goto fail_nomem; + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); + + filp->private_data = info; + info->mm = NULL; + + info->idx_map = kmalloc(sizeof(*info->idx_map) * MAX_PENDING_REQS, + GFP_KERNEL); + + if (info->idx_map == NULL) + goto fail_nomem; + + if (idx > 0) { + init_waitqueue_head(&info->wait); + for (i = 0; i < MAX_PENDING_REQS; i++) { + info->idx_map[i].mem = INVALID_MIDX; + info->idx_map[i].req = ~0; + } + } + + DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx); + return 0; + + fail_nomem: + return -ENOMEM; +} + +static int blktap_release(struct inode *inode, struct file *filp) +{ + tap_blkif_t *info = filp->private_data; + struct mm_struct *mm; + + /* check for control device */ + if (!info) + return 0; + + info->ring_ok = 0; + smp_wmb(); + + mm = xchg(&info->mm, NULL); + if (mm) + mmput(mm); + kfree(info->foreign_map.map); + info->foreign_map.map = NULL; + + /* Free the ring page. 
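+	 * It was marked reserved in blktap_open(), so clear that bit before
+	 * handing the page back to the allocator.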
*/ + ClearPageReserved(virt_to_page(info->ufe_ring.sring)); + free_page((unsigned long) info->ufe_ring.sring); + + if (info->idx_map) { + kfree(info->idx_map); + info->idx_map = NULL; + } + + if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { + if (info->blkif->xenblkd != NULL) { + kthread_stop(info->blkif->xenblkd); + info->blkif->xenblkd = NULL; + } + info->status = CLEANSHUTDOWN; + } + + clear_bit(0, &info->dev_inuse); + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor); + + return 0; +} + + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size; + tap_blkif_t *info = filp->private_data; + int ret; + + if (info == NULL) { + WPRINTK("blktap: mmap, retrieving idx failed\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_vm_ops; + + size = vma->vm_end - vma->vm_start; + if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) { + WPRINTK("you _must_ map exactly %d pages!\n", + mmap_pages + RING_PAGES); + return -EAGAIN; + } + + size >>= PAGE_SHIFT; + info->rings_vstart = vma->vm_start; + info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + ret = vm_insert_page(vma, vma->vm_start, + virt_to_page(info->ufe_ring.sring)); + else + ret = remap_pfn_range(vma, vma->vm_start, + __pa(info->ufe_ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (ret) { + WPRINTK("Mapping user ring failed!\n"); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) * + sizeof(*info->foreign_map.map), GFP_KERNEL); + if (info->foreign_map.map == NULL) { + WPRINTK("Couldn't alloc VM_FOREIGN map.\n"); + goto fail; + } + + vma->vm_private_data = &info->foreign_map; + vma->vm_flags |= VM_FOREIGN; + vma->vm_flags |= VM_DONTCOPY; + +#ifdef CONFIG_X86 + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + info->mm = get_task_mm(current); + smp_wmb(); + info->ring_ok = 1; + return 0; + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + + return -ENOMEM; +} + + +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + tap_blkif_t *info = filp->private_data; + + switch(cmd) { + case BLKTAP_IOCTL_KICK_FE: + { + /* There are fe messages to process. 
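+		 * Userspace has pushed responses onto the ufe ring; consume
+		 * them and forward the completions to the frontend.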
*/ + return blktap_read_ufe_ring(info); + } + case BLKTAP_IOCTL_SETMODE: + { + if (info) { + if (BLKTAP_MODE_VALID(arg)) { + info->mode = arg; + /* XXX: may need to flush rings here. */ + DPRINTK("blktap: set mode to %lx\n", + arg); + return 0; + } + } + return 0; + } + case BLKTAP_IOCTL_PRINT_IDXS: + { + if (info) { + printk("User Rings: \n-----------\n"); + printk("UF: rsp_cons: %2d, req_prod_prv: %2d " + "| req_prod: %2d, rsp_prod: %2d\n", + info->ufe_ring.rsp_cons, + info->ufe_ring.req_prod_pvt, + info->ufe_ring.sring->req_prod, + info->ufe_ring.sring->rsp_prod); + } + return 0; + } + case BLKTAP_IOCTL_SENDPID: + { + if (info) { + info->pid = (pid_t)arg; + DPRINTK("blktap: pid received %d\n", + info->pid); + } + return 0; + } + case BLKTAP_IOCTL_NEWINTF: + { + uint64_t val = (uint64_t)arg; + domid_translate_t *tr = (domid_translate_t *)&val; + + DPRINTK("NEWINTF Req for domid %d and bus id %d\n", + tr->domid, tr->busid); + info = get_next_free_dev(); + if (!info) { + WPRINTK("Error initialising /dev/xen/blktap - " + "No more devices\n"); + return -1; + } + info->trans.domid = tr->domid; + info->trans.busid = tr->busid; + return info->minor; + } + case BLKTAP_IOCTL_NEWINTF_EXT: + { + void __user *udata = (void __user *) arg; + domid_translate_ext_t tr; + + if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t))) + return -EFAULT; + + DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n", + tr.domid, tr.busid); + info = get_next_free_dev(); + if (!info) { + WPRINTK("Error initialising /dev/xen/blktap - " + "No more devices\n"); + return -1; + } + info->trans.domid = tr.domid; + info->trans.busid = tr.busid; + return info->minor; + } + case BLKTAP_IOCTL_FREEINTF: + { + unsigned long dev = arg; + unsigned long flags; + + if (info || dev >= MAX_TAP_DEV) + return -EINVAL; + + info = tapfds[dev]; + if (!info) + return 0; /* should this be an error? 
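+			   No tap device exists at this minor, so there is
+			   nothing to free anyway.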
*/ + + spin_lock_irqsave(&pending_free_lock, flags); + if (info->dev_pending) + info->dev_pending = 0; + spin_unlock_irqrestore(&pending_free_lock, flags); + + return 0; + } + case BLKTAP_IOCTL_MINOR: + if (!info) { + unsigned long dev = arg; + + if (dev >= MAX_TAP_DEV) + return -EINVAL; + + info = tapfds[dev]; + if (!info) + return -EINVAL; + } + + return info->minor; + + case BLKTAP_IOCTL_MAJOR: + return blktap_major; + + case BLKTAP_QUERY_ALLOC_REQS: + WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%lu\n", + alloc_pending_reqs, MAX_PENDING_REQS); + return (alloc_pending_reqs/MAX_PENDING_REQS) * 100; + } + return -ENOIOCTLCMD; +} + +static unsigned int blktap_poll(struct file *filp, poll_table *wait) +{ + tap_blkif_t *info = filp->private_data; + + /* do not work on the control device */ + if (!info) + return 0; + + poll_wait(filp, &info->wait, wait); + if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { + RING_PUSH_REQUESTS(&info->ufe_ring); + return POLLIN | POLLRDNORM; + } + return 0; +} + +static void blktap_kick_user(int idx) +{ + tap_blkif_t *info; + + if (idx < 0 || idx >= MAX_TAP_DEV) + return; + + info = tapfds[idx]; + if (!info) + return; + + wake_up_interruptible(&info->wait); + + return; +} + +static int do_block_io_op(blkif_t *blkif); +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req); +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st); + +/****************************************************************** + * misc small helpers + */ +static int req_increase(void) +{ + int i, j; + + if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) + return -EINVAL; + + pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) + * MAX_PENDING_REQS, GFP_KERNEL); + foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages); + + if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc]) + goto out_of_memory; + + DPRINTK("%s: reqs=%lu, pages=%d\n", + __FUNCTION__, MAX_PENDING_REQS, mmap_pages); + + for (i = 0; i < MAX_PENDING_REQS; i++) { + list_add_tail(&pending_reqs[mmap_alloc][i].free_list, + &pending_free); + pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc; + for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) + BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, + i, j)); + } + + mmap_alloc++; + DPRINTK("# MMAPs increased to %d\n",mmap_alloc); + return 0; + + out_of_memory: + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + kfree(pending_reqs[mmap_alloc]); + WPRINTK("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; +} + +static void mmap_req_del(int mmap) +{ + assert_spin_locked(&pending_free_lock); + + kfree(pending_reqs[mmap]); + pending_reqs[mmap] = NULL; + + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + foreign_pages[mmap] = NULL; + + mmap_lock = 0; + DPRINTK("# MMAPs decreased to %d\n",mmap_alloc); + mmap_alloc--; +} + +static pending_req_t* alloc_req(void) +{ + pending_req_t *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&pending_free_lock, flags); + + if (!list_empty(&pending_free)) { + req = list_entry(pending_free.next, pending_req_t, free_list); + list_del(&req->free_list); + } + + if (req) { + req->inuse = 1; + alloc_pending_reqs++; + } + spin_unlock_irqrestore(&pending_free_lock, flags); + + return req; +} + +static void free_req(pending_req_t *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&pending_free_lock, flags); + + alloc_pending_reqs--; + req->inuse = 0; + if (mmap_lock && 
(req->mem_idx == mmap_alloc-1)) { + mmap_inuse--; + if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); + spin_unlock_irqrestore(&pending_free_lock, flags); + return; + } + was_empty = list_empty(&pending_free); + list_add(&req->free_list, &pending_free); + + spin_unlock_irqrestore(&pending_free_lock, flags); + + if (was_empty) + wake_up(&pending_free_wq); +} + +static void blktap_zap_page_range(struct mm_struct *mm, + unsigned long uvaddr, int nr_pages) +{ + unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT); + struct vm_area_struct *vma; + + vma = find_vma(mm, uvaddr); + while (vma && uvaddr < end) { + unsigned long s = max(uvaddr, vma->vm_start); + unsigned long e = min(end, vma->vm_end); + + zap_page_range(vma, s, e - s, NULL); + + uvaddr = e; + vma = vma->vm_next; + } +} + +static void fast_flush_area(pending_req_t *req, unsigned int k_idx, + unsigned int u_idx, int tapidx) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int i, mmap_idx, invcount = 0, locked = 0; + struct grant_handle_pair *khandle; + uint64_t ptep; + int ret; + unsigned long uvaddr; + tap_blkif_t *info; + struct mm_struct *mm; + + + if ((tapidx < 0) || (tapidx >= MAX_TAP_DEV) + || !(info = tapfds[tapidx])) { + WPRINTK("fast_flush: Couldn't get info!\n"); + return; + } + + mm = info->mm; + + if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) { + down_write(&mm->mmap_sem); + blktap_zap_page_range(mm, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages); + up_write(&mm->mmap_sem); + return; + } + + mmap_idx = req->mem_idx; + + for (i = 0; i < req->nr_pages; i++) { + uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); + + khandle = &pending_handle(mmap_idx, k_idx, i); + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[invcount], + idx_to_kaddr(mmap_idx, k_idx, i), + GNTMAP_host_map, khandle->kernel); + invcount++; + + set_phys_to_machine( + page_to_pfn(idx_to_page(mmap_idx, k_idx, i)), + INVALID_P2M_ENTRY); + } + + if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (!locked++) + down_write(&mm->mmap_sem); + if (create_lookup_pte_addr( + mm, + MMAP_VADDR(info->user_vstart, u_idx, i), + &ptep) !=0) { + up_write(&mm->mmap_sem); + WPRINTK("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[invcount], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + invcount++; + } + + BLKTAP_INVALIDATE_HANDLE(khandle); + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + + if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) { + if (!locked++) + down_write(&mm->mmap_sem); + blktap_zap_page_range(mm, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages); + } + + if (locked) + up_write(&mm->mmap_sem); +} + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void print_stats(blkif_t *blkif) +{ + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int tap_blkif_schedule(void *arg) +{ + blkif_t *blkif = arg; + tap_blkif_t *info; + + blkif_get(blkif); + + if (debug_lvl) + printk(KERN_DEBUG "%s: started\n", current->comm); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + 
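+		/* Sleep until the frontend signals new work and a free
+		 * pending_req slot is available. */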
wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + pending_free_wq, + !list_empty(&pending_free) || kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + if (debug_lvl) + printk(KERN_DEBUG "%s: exiting\n", current->comm); + + blkif->xenblkd = NULL; + info = tapfds[blkif->dev_num]; + blkif_put(blkif); + + if (info) { + struct mm_struct *mm = xchg(&info->mm, NULL); + + if (mm) + mmput(mm); + } + + return 0; +} + +/****************************************************************** + * COMPLETION CALLBACK -- Called by user level ioctl() + */ + +static int blktap_read_ufe_ring(tap_blkif_t *info) +{ + /* This is called to read responses from the UFE ring. */ + RING_IDX i, j, rp; + blkif_response_t *resp; + blkif_t *blkif=NULL; + unsigned int pending_idx, usr_idx, mmap_idx; + pending_req_t *pending_req; + + if (!info) + return 0; + + /* We currently only forward packets in INTERCEPT_FE mode. */ + if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) + return 0; + + /* for each outstanding message on the UFEring */ + rp = info->ufe_ring.sring->rsp_prod; + rmb(); + + for (i = info->ufe_ring.rsp_cons; i != rp; i++) { + blkif_response_t res; + resp = RING_GET_RESPONSE(&info->ufe_ring, i); + memcpy(&res, resp, sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ + ++info->ufe_ring.rsp_cons; + + /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ + if (res.id >= MAX_PENDING_REQS) { + WPRINTK("incorrect req map [%llx]\n", + (unsigned long long)res.id); + continue; + } + + usr_idx = (unsigned int)res.id; + pending_idx = info->idx_map[usr_idx].req; + mmap_idx = info->idx_map[usr_idx].mem; + + if (mmap_idx >= mmap_alloc || + pending_idx >= MAX_PENDING_REQS) { + WPRINTK("incorrect req map [%d]," + " internal map [%d,%d]\n", + usr_idx, mmap_idx, pending_idx); + continue; + } + + pending_req = &pending_reqs[mmap_idx][pending_idx]; + blkif = pending_req->blkif; + + for (j = 0; j < pending_req->nr_pages; j++) { + + unsigned long uvaddr; + struct page *pg; + int offset; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); + + pg = idx_to_page(mmap_idx, pending_idx, j); + ClearPageReserved(pg); + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + info->foreign_map.map[offset] = NULL; + } + fast_flush_area(pending_req, pending_idx, usr_idx, info->minor); + info->idx_map[usr_idx].mem = INVALID_MIDX; + make_response(blkif, pending_req->id, res.operation, + res.status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } + + return 0; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +static void blkif_notify_work(blkif_t *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. 
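+ * (In blktap the "block-device layer" is the user-space tapdisk ring:
+ * requests are grant-mapped and handed to userspace rather than being
+ * submitted as bios in the kernel.)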
+ */ +static int print_dbug = 1; +static int do_block_io_op(blkif_t *blkif) +{ + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + blkif_request_t req; + pending_req_t *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + tap_blkif_t *info; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + /*Check blkif has corresponding UE ring*/ + if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) { + /*oops*/ + if (print_dbug) { + WPRINTK("Corresponding UE " + "ring does not exist!\n"); + print_dbug = 0; /*We only print this message once*/ + } + return 0; + } + + info = tapfds[blkif->dev_num]; + + if (!info || !test_bit(0, &info->dev_inuse)) { + if (print_dbug) { + WPRINTK("Can't get UE info!\n"); + print_dbug = 0; + } + return 0; + } + + while (rc != rp) { + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("RING_FULL! More to do\n"); + more_to_do = 1; + break; + } + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) { + WPRINTK("RING_REQUEST_CONS_OVERFLOW!" + " More to do\n"); + more_to_do = 1; + break; + } + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), + sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. */ + barrier(); + + switch (req.operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, &req, pending_req); + break; + + default: + /* A good sign something is wrong: sleep for a while to + * avoid excessive CPU consumption by a bad guest. */ + msleep(1); + WPRINTK("unknown operation [%d]\n", + req.operation); + make_response(blkif, req.id, req.operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + blktap_kick_user(blkif->dev_num); + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, + blkif_request_t *req, + pending_req_t *pending_req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int nseg; + int ret, i, nr_sects = 0; + tap_blkif_t *info; + blkif_request_t *target; + unsigned int mmap_idx = pending_req->mem_idx; + unsigned int pending_idx = RTN_PEND_IDX(pending_req, mmap_idx); + unsigned int usr_idx; + struct mm_struct *mm; + struct vm_area_struct *vma = NULL; + + if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) + goto fail_response; + + info = tapfds[blkif->dev_num]; + if (info == NULL) + goto fail_response; + + /* Check we have space on user ring - should never fail. */ + usr_idx = GET_NEXT_REQ(info->idx_map); + if (usr_idx >= MAX_PENDING_REQS) { + WARN_ON(1); + goto fail_response; + } + + /* Check that number of segments is sane. 
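+	 * nseg bounds the grant-map batch and the per-request segment arrays
+	 * below, so it must lie in [1, BLKIF_MAX_SEGMENTS_PER_REQUEST].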
*/ + nseg = req->nr_segments; + if ( unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { + WPRINTK("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + /* Make sure userspace is ready. */ + if (!info->ring_ok) { + WPRINTK("blktap: ring not ready for requests!\n"); + goto fail_response; + } + smp_rmb(); + + if (RING_FULL(&info->ufe_ring)) { + WPRINTK("blktap: fe_ring is full, can't add " + "IO Request will be dropped. %d %d\n", + RING_SIZE(&info->ufe_ring), + RING_SIZE(&blkif->blk_rings.common)); + goto fail_response; + } + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + op = 0; + mm = info->mm; + if (!xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&mm->mmap_sem); + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long kvaddr; + uint64_t ptep; + uint32_t flags; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); + + flags = GNTMAP_host_map; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], kvaddr, flags, + req->seg[i].gref, blkif->domid); + op++; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Now map it to user. */ + ret = create_lookup_pte_addr(mm, uvaddr, &ptep); + if (ret) { + up_write(&mm->mmap_sem); + WPRINTK("Couldn't get a pte addr!\n"); + goto fail_flush; + } + + flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], ptep, flags, + req->seg[i].gref, blkif->domid); + op++; + } + + nr_sects += (req->seg[i].last_sect - + req->seg[i].first_sect + 1); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); + BUG_ON(ret); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + up_write(&mm->mmap_sem); + + for (i = 0; i < (nseg*2); i+=2) { + unsigned long uvaddr; + unsigned long offset; + struct page *pg; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); + + gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]); + + if (unlikely(map[i].status != GNTST_okay)) { + WPRINTK("invalid kernel buffer -- could not remap it\n"); + ret = 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + if (unlikely(map[i+1].status != GNTST_okay)) { + WPRINTK("invalid kernel buffer -- could not remap it\n"); + ret = 1; + map[i+1].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i/2).kernel + = map[i].handle; + pending_handle(mmap_idx, pending_idx, i/2).user + = map[i+1].handle; + + if (ret) + continue; + + pg = idx_to_page(mmap_idx, pending_idx, i/2); + set_phys_to_machine(page_to_pfn(pg), + FOREIGN_FRAME(map[i].dev_bus_addr + >> PAGE_SHIFT)); + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + info->foreign_map.map[offset] = pg; + } + } else { + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long offset; + struct page *pg; + + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + + gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]); + + if (unlikely(map[i].status != GNTST_okay)) { + WPRINTK("invalid kernel buffer -- could not remap it\n"); + ret = 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i).kernel + = map[i].handle; + + if (ret) + continue; + + offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT; + pg = idx_to_page(mmap_idx, pending_idx, i); + 
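+			/* Publish the page in the VM_FOREIGN map so that
+			 * get_user_pages() on the tapdisk mapping finds it. */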
info->foreign_map.map[offset] = pg; + } + } + + if (ret) + goto fail_flush; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&mm->mmap_sem); + /* Mark mapped pages as reserved: */ + for (i = 0; i < req->nr_segments; i++) { + struct page *pg; + + pg = idx_to_page(mmap_idx, pending_idx, i); + SetPageReserved(pg); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long uvaddr = MMAP_VADDR(info->user_vstart, + usr_idx, i); + if (vma && uvaddr >= vma->vm_end) { + vma = vma->vm_next; + if (vma && + (uvaddr < vma->vm_start || + uvaddr >= vma->vm_end)) + vma = NULL; + } + if (vma == NULL) { + vma = find_vma(mm, uvaddr); + /* this virtual area was already munmapped. + so skip to next page */ + if (!vma) + continue; + } + ret = vm_insert_page(vma, uvaddr, pg); + if (ret) { + up_write(&mm->mmap_sem); + goto fail_flush; + } + } + } + if (xen_feature(XENFEAT_auto_translated_physmap)) + up_write(&mm->mmap_sem); + + /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ + info->idx_map[usr_idx].mem = mmap_idx; + info->idx_map[usr_idx].req = pending_idx; + + blkif_get(blkif); + /* Finally, write the request message to the user ring. */ + target = RING_GET_REQUEST(&info->ufe_ring, + info->ufe_ring.req_prod_pvt); + memcpy(target, req, sizeof(*req)); + target->id = usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ + info->ufe_ring.req_prod_pvt++; + + if (operation == READ) + blkif->st_rd_sect += nr_sects; + else if (operation == WRITE) + blkif->st_wr_sect += nr_sects; + + return; + + fail_flush: + WPRINTK("Reached Fail_flush\n"); + fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, u64 id, + unsigned short op, int st) +{ + blkif_response_t resp; + unsigned long flags; + blkif_back_rings_t *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, + blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, + blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, + blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
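+		 * RING_FINAL_CHECK_FOR_REQUESTS sets req_event and re-checks
+		 * req_prod, closing the race with a frontend that queues a
+		 * request just as the ring goes idle.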
+ */ + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { + more_to_do = 1; + } + + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init blkif_init(void) +{ + int i, ret; + struct class *class; + + if (!is_running_on_xen()) + return -ENODEV; + + INIT_LIST_HEAD(&pending_free); + for(i = 0; i < 2; i++) { + ret = req_increase(); + if (ret) + break; + } + if (i == 0) + return ret; + + tap_blkif_interface_init(); + + alloc_pending_reqs = 0; + + tap_blkif_xenbus_init(); + + /* Dynamically allocate a major for this device */ + ret = register_chrdev(0, "blktap", &blktap_fops); + + if (ret < 0) { + WPRINTK("Couldn't register /dev/xen/blktap\n"); + return -ENOMEM; + } + + blktap_major = ret; + + /* tapfds[0] is always NULL */ + blktap_next_minor++; + + DPRINTK("Created misc_dev %d:0 [/dev/xen/blktap0]\n", ret); + + /* Make sure the xen class exists */ + if ((class = get_xen_class()) != NULL) { + /* + * This will allow udev to create the blktap ctrl device. + * We only want to create blktap0 first. We don't want + * to flood the sysfs system with needless blktap devices. + * We only create the device when a request of a new device is + * made. + */ + class_device_create(class, NULL, + MKDEV(blktap_major, 0), NULL, + "blktap0"); + } else { + /* this is bad, but not fatal */ + WPRINTK("blktap: sysfs xen_class not created\n"); + } + + DPRINTK("Blktap device successfully created\n"); + + return 0; +} + +module_init(blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap/common.h 2008-09-15 13:40:15.000000000 +0200 @@ -0,0 +1,122 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DPRINTK(_f, _a...) 
pr_debug("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) + +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args) + +struct backend_info; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + blkif_back_rings_t blk_rings; + struct vm_struct *blk_ring_area; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + struct task_struct *xenblkd; + unsigned int waiting_reqs; + request_queue_t *plug; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; + + int dev_num; + uint64_t sectors; +} blkif_t; + +blkif_t *tap_alloc_blkif(domid_t domid); +void tap_blkif_free(blkif_t *blkif); +void tap_blkif_kmem_cache_free(blkif_t *blkif); +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, + unsigned int evtchn); +void tap_blkif_unmap(blkif_t *blkif); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +void tap_blkif_interface_init(void); + +void tap_blkif_xenbus_init(void); + +irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); +int tap_blkif_schedule(void *arg); + +int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif); +void signal_tapdisk(int idx); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap/interface.c 2010-09-23 15:39:04.000000000 +0200 @@ -0,0 +1,183 @@ +/****************************************************************************** + * drivers/xen/blktap/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + + */ + +#include "common.h" +#include +#include + +static kmem_cache_t *blkif_cachep; + +blkif_t *tap_alloc_blkif(domid_t domid) +{ + blkif_t *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + blkif->st_print = jiffies; + init_waitqueue_head(&blkif->waiting_to_free); + + return blkif; +} + +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + int ret; + + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, shared_page, blkif->domid); + + gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op); + + if (op.status == GNTST_okay) { + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + ret = 0; + } else { + DPRINTK("Grant table operation failure %d!\n", (int)op.status); + ret = -EINVAL; + } + + return ret; +} + +static void unmap_frontend_page(blkif_t *blkif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, blkif->shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, + unsigned int evtchn) +{ + int err; + + /* Already connected through? */ + if (blkif->irq) + return 0; + + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler( + blkif->domid, evtchn, tap_blkif_be_int, + 0, "blkif-backend", blkif); + if (err < 0) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +void tap_blkif_unmap(blkif_t *blkif) +{ + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void tap_blkif_free(blkif_t *blkif) +{ + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + tap_blkif_unmap(blkif); +} + +void tap_blkif_kmem_cache_free(blkif_t *blkif) +{ + if 
(!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(blkif_cachep, blkif); +} + +void __init tap_blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap/xenbus.c 2010-11-25 09:36:37.000000000 +0100 @@ -0,0 +1,508 @@ +/* drivers/xen/blktap/xenbus.c + * + * Xenbus code for blktap + * + * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield + * + * Based on the blkback xenbus code: + * + * Copyright (C) 2005 Rusty Russell + * Copyright (C) 2005 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "common.h" + + +struct backend_info +{ + struct xenbus_device *dev; + blkif_t *blkif; + struct xenbus_watch backend_watch; + int xenbus_id; + int group_added; +}; + +static DEFINE_RWLOCK(sysfs_read_lock); + +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static int blktap_remove(struct xenbus_device *dev); +static int blktap_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id); +static void tap_backend_changed(struct xenbus_watch *, const char **, + unsigned int); +static void tap_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state); + +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? 
i : -ERANGE; +} + +static long get_id(const char *str) +{ + int len,end; + const char *ptr; + char *tptr, num[10]; + + len = strsep_len(str, '/', 2); + end = strlen(str); + if ( (len < 0) || (end < 0) ) return -1; + + ptr = str + len + 1; + strncpy(num,ptr,end - len); + tptr = num + (end - (len + 1)); + *tptr = '\0'; + DPRINTK("Get_id called for %s (%s)\n",str,num); + + return simple_strtol(num, NULL, 10); +} + +static int blktap_name(blkif_t *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + if ((devname = strstr(devpath, "/dev/")) != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +/**************************************************************** + * sysfs interface for I/O requests of blktap device + */ + +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + ssize_t ret = -ENODEV; \ + struct xenbus_device *dev; \ + struct backend_info *be; \ + \ + if (!get_device(_dev)) \ + return ret; \ + dev = to_xenbus_device(_dev); \ + read_lock(&sysfs_read_lock); \ + if ((be = dev->dev.driver_data) != NULL) \ + ret = sprintf(buf, format, ##args); \ + read_unlock(&sysfs_read_lock); \ + put_device(_dev); \ + return ret; \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *tapstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group tapstat_group = { + .name = "statistics", + .attrs = tapstat_attrs, +}; + +int xentap_sysfs_addif(struct xenbus_device *dev) +{ + int err; + struct backend_info *be = dev->dev.driver_data; + err = sysfs_create_group(&dev->dev.kobj, &tapstat_group); + if (!err) + be->group_added = 1; + return err; +} + +void xentap_sysfs_delif(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + sysfs_remove_group(&dev->dev.kobj, &tapstat_group); + be->group_added = 0; +} + +static int blktap_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->dev.driver_data; + + write_lock(&sysfs_read_lock); + if (be->group_added) + xentap_sysfs_delif(be->dev); + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + if (be->blkif) { + if (be->blkif->xenblkd) + kthread_stop(be->blkif->xenblkd); + signal_tapdisk(be->blkif->dev_num); + tap_blkif_free(be->blkif); + tap_blkif_kmem_cache_free(be->blkif); + be->blkif = NULL; + } + kfree(be); + dev->dev.driver_data = NULL; + write_unlock(&sysfs_read_lock); + return 0; +} + +static void tap_update_blkif_status(blkif_t *blkif) +{ + int err; + char name[TASK_COMM_LEN]; + + /* Not ready to connect? */ + if(!blkif->irq || !blkif->sectors) { + return; + } + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. 
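+ * connect() only flips the xenbus state to Connected; if that fails, the
+ * state check that follows returns early, before sysfs entries are added
+ * or the xenblkd thread is started.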
*/ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blktap_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blktap dev name"); + return; + } + + if (!blkif->be->group_added) { + err = xentap_sysfs_addif(blkif->be->dev); + if (err) { + xenbus_dev_fatal(blkif->be->dev, err, + "creating sysfs entries"); + return; + } + } + + blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd"); + WPRINTK("Error starting thread %s\n", name); + } else + DPRINTK("Thread started for domid %d, connected disk %d\n", + blkif->domid, blkif->dev_num); + +} + +/** + * Entry point to this code when a new device is created. Allocate + * the basic structures, and watch the store waiting for the + * user-space program to tell us the physical device info. Switch to + * InitWait. + */ +static int blktap_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + + be->dev = dev; + dev->dev.driver_data = be; + be->xenbus_id = get_id(dev->nodename); + + be->blkif = tap_alloc_blkif(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + be->blkif->sectors = 0; + + /* set a watch on disk info, waiting for userspace to update details*/ + err = xenbus_watch_path2(dev, dev->nodename, "info", + &be->backend_watch, tap_backend_changed); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + return 0; + +fail: + DPRINTK("blktap probe failed\n"); + blktap_remove(dev); + return err; +} + + +/** + * Callback received when the user space code has placed the device + * information in xenstore. + */ +static void tap_backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned long info; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + + /** + * Check to see whether userspace code has opened the image + * and written sector + * and disk info to xenstore + */ + err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, + NULL); + if (XENBUS_EXIST_ERR(err)) + return; + if (err) { + xenbus_dev_error(dev, err, "getting info"); + return; + } + + DPRINTK("Userspace update on disk info, %lu\n",info); + + err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", + &be->blkif->sectors, NULL); + + /* Associate tap dev with domid*/ + be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, + be->blkif); + + tap_update_blkif_status(be->blkif); +} + + +static void blkif_disconnect(blkif_t *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + /* idempotent */ + tap_blkif_free(blkif); +} + +/** + * Callback received when the frontend's state changes. 
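+ * In outline, mirroring the switch below: Initialising while we are
+ * Closed means the frontend is reconnecting, so go back to InitWait;
+ * Initialised/Connected tears down any stale connection, maps the ring
+ * via connect_ring() and brings the device up; Closing disconnects and
+ * acknowledges by switching to Closing; Closed/Unknown switches to
+ * Closed and unregisters the device unless it is marked online.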
+ */ +static void tap_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev->dev.driver_data; + int err; + + DPRINTK("fe_changed(%s,%d)\n", dev->nodename, frontend_state); + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + printk(KERN_INFO "%s: %s: prepare for reconnect\n", + __FUNCTION__, dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + /* Ensure we connect even when two watches fire in + close successsion and we miss the intermediate value + of frontend_state. */ + if (dev->state == XenbusStateConnected) + break; + + /* Enforce precondition before potential leak point. + * blkif_disconnect() is idempotent. + */ + blkif_disconnect(be->blkif); + + err = connect_ring(be); + if (err) + break; + tap_update_blkif_status(be->blkif); + break; + + case XenbusStateClosing: + blkif_disconnect(be->blkif); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + /* Implies the effects of blkif_disconnect() via + * blktap_remove(). + */ + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +/** + * Switch to Connected state. + */ +static void connect(struct backend_info *be) +{ + int err; + + struct xenbus_device *dev = be->dev; + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(dev, err, "switching to Connected state", + dev->nodename); + + return; +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + char protocol[64]; + int err; + + DPRINTK("%s\n", dev->otherend); + + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", + &ring_ref, "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming native"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + printk(KERN_INFO + "blktap: ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); + + /* Map the shared frame, irq etc. 
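+ * tap_blkif_map() grant-maps the frontend's shared ring page (retrying
+ * on GNTST_eagain), initialises the back ring for the negotiated
+ * protocol and binds the interdomain event channel; it unwinds the
+ * mapping itself if the event-channel bind fails.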
*/ + err = tap_blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id blktap_ids[] = { + { "tap" }, + { "" } +}; + + +static struct xenbus_driver blktap = { + .name = "tap", + .owner = THIS_MODULE, + .ids = blktap_ids, + .probe = blktap_probe, + .remove = blktap_remove, + .otherend_changed = tap_frontend_changed +}; + + +void tap_blkif_xenbus_init(void) +{ + xenbus_register_backend(&blktap); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/Makefile 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_TAP2) := blktap.o + +blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/blktap.h 2010-02-24 13:13:46.000000000 +0100 @@ -0,0 +1,254 @@ +#ifndef _BLKTAP_H_ +#define _BLKTAP_H_ + +#include +#include +#include +#include +#include +#include +#include + +//#define ENABLE_PASSTHROUGH + +extern int blktap_debug_level; + +#define BTPRINTK(level, tag, force, _f, _a...) \ + do { \ + if (blktap_debug_level > level && \ + (force || printk_ratelimit())) \ + printk(tag "%s: " _f, __func__, ##_a); \ + } while (0) + +#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a) +#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a) +#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) +#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) + +#define MAX_BLKTAP_DEVICE 256 + +#define BLKTAP_CONTROL 1 +#define BLKTAP_RING_FD 2 +#define BLKTAP_RING_VMA 3 +#define BLKTAP_DEVICE 4 +#define BLKTAP_SYSFS 5 +#define BLKTAP_PAUSE_REQUESTED 6 +#define BLKTAP_PAUSED 7 +#define BLKTAP_SHUTDOWN_REQUESTED 8 +#define BLKTAP_PASSTHROUGH 9 +#define BLKTAP_DEFERRED 10 + +/* blktap IOCTLs: */ +#define BLKTAP2_IOCTL_KICK_FE 1 +#define BLKTAP2_IOCTL_ALLOC_TAP 200 +#define BLKTAP2_IOCTL_FREE_TAP 201 +#define BLKTAP2_IOCTL_CREATE_DEVICE 202 +#define BLKTAP2_IOCTL_SET_PARAMS 203 +#define BLKTAP2_IOCTL_PAUSE 204 +#define BLKTAP2_IOCTL_REOPEN 205 +#define BLKTAP2_IOCTL_RESUME 206 + +#define BLKTAP2_MAX_MESSAGE_LEN 256 + +#define BLKTAP2_RING_MESSAGE_PAUSE 1 +#define BLKTAP2_RING_MESSAGE_RESUME 2 +#define BLKTAP2_RING_MESSAGE_CLOSE 3 + +#define BLKTAP_REQUEST_FREE 0 +#define BLKTAP_REQUEST_PENDING 1 + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. 
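+ *
+ * Worked example (ring geometry assumed here, not defined in this
+ * header): with 4 KiB pages the blkif shared ring holds 32 requests and
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST is 11, so one allocation unit of
+ * MAX_PENDING_REQS requests pins MMAP_PAGES = 32 * 11 = 352 pages;
+ * blktap_request_pool_init() in request.c starts with two such buckets.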
+ */ +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req, _seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +#define blktap_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blktap_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->wq); \ + } while (0) + +struct blktap; + +struct grant_handle_pair { + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +struct blktap_handle { + unsigned int ring; + unsigned int device; + unsigned int minor; +}; + +struct blktap_params { + char name[BLKTAP2_MAX_MESSAGE_LEN]; + unsigned long long capacity; + unsigned long sector_size; +}; + +struct blktap_device { + int users; + spinlock_t lock; + struct gendisk *gd; + +#ifdef ENABLE_PASSTHROUGH + struct block_device *bdev; +#endif +}; + +struct blktap_ring { + struct vm_area_struct *vma; + blkif_front_ring_t ring; + struct vm_foreign_map foreign_map; + unsigned long ring_vstart; + unsigned long user_vstart; + + int response; + + wait_queue_head_t poll_wait; + + dev_t devno; + struct class_device *dev; + atomic_t sysfs_refcnt; + struct mutex sysfs_mutex; +}; + +struct blktap_statistics { + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + s64 st_rd_cnt; + s64 st_rd_sum_usecs; + s64 st_rd_max_usecs; + s64 st_wr_cnt; + s64 st_wr_sum_usecs; + s64 st_wr_max_usecs; +}; + +struct blktap_request { + uint64_t id; + uint16_t usr_idx; + + uint8_t status; + atomic_t pendcnt; + uint8_t nr_pages; + unsigned short operation; + + struct timeval time; + struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct list_head free_list; +}; + +struct blktap { + int minor; + pid_t pid; + atomic_t refcnt; + unsigned long dev_inuse; + + struct blktap_params params; + + struct rw_semaphore tap_sem; + + struct blktap_ring ring; + struct blktap_device device; + + int pending_cnt; + struct blktap_request *pending_requests[MAX_PENDING_REQS]; + struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + + wait_queue_head_t wq; + struct list_head deferred_queue; + + struct blktap_statistics stats; +}; + +extern struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static inline int +blktap_active(struct blktap *tap) +{ + return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse); +} + +static inline int +blktap_validate_params(struct blktap *tap, struct blktap_params *params) +{ + /* TODO: sanity check */ + params->name[sizeof(params->name) - 1] = '\0'; + BTINFO("%s: capacity: %llu, sector-size: %lu\n", + params->name, params->capacity, params->sector_size); + return 0; +} + +int blktap_control_destroy_device(struct blktap *); + +int blktap_ring_init(int *); +int blktap_ring_free(void); +int blktap_ring_create(struct blktap *); +int blktap_ring_destroy(struct blktap *); +int blktap_ring_pause(struct blktap *); +int blktap_ring_resume(struct blktap *); +void blktap_ring_kick_user(struct blktap *); + +int blktap_sysfs_init(void); +void blktap_sysfs_free(void); +int blktap_sysfs_create(struct blktap *); +int blktap_sysfs_destroy(struct blktap *); + +int blktap_device_init(int *); +void blktap_device_free(void); +int blktap_device_create(struct blktap *); +int blktap_device_destroy(struct blktap *); +int blktap_device_pause(struct blktap *); +int blktap_device_resume(struct blktap 
*); +void blktap_device_restart(struct blktap *); +void blktap_device_finish_request(struct blktap *, + blkif_response_t *, + struct blktap_request *); +void blktap_device_fail_pending_requests(struct blktap *); +#ifdef ENABLE_PASSTHROUGH +int blktap_device_enable_passthrough(struct blktap *, + unsigned, unsigned); +#endif + +void blktap_defer(struct blktap *); +void blktap_run_deferred(void); + +int blktap_request_pool_init(void); +void blktap_request_pool_free(void); +int blktap_request_pool_grow(void); +int blktap_request_pool_shrink(void); +struct blktap_request *blktap_request_allocate(struct blktap *); +void blktap_request_free(struct blktap *, struct blktap_request *); +struct page *request_to_page(struct blktap_request *, int); + +static inline unsigned long +request_to_kaddr(struct blktap_request *req, int seg) +{ + unsigned long pfn = page_to_pfn(request_to_page(req, seg)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/control.c 2010-04-29 09:34:47.000000000 +0200 @@ -0,0 +1,277 @@ +#include +#include + +#include "blktap.h" + +static DEFINE_SPINLOCK(blktap_control_lock); +struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static int ring_major; +static int device_major; +static int blktap_control_registered; + +static void +blktap_control_initialize_tap(struct blktap *tap) +{ + int minor = tap->minor; + + memset(tap, 0, sizeof(*tap)); + set_bit(BLKTAP_CONTROL, &tap->dev_inuse); + init_rwsem(&tap->tap_sem); + init_waitqueue_head(&tap->wq); + atomic_set(&tap->refcnt, 0); + + tap->minor = minor; +} + +static struct blktap * +blktap_control_create_tap(void) +{ + int minor; + struct blktap *tap; + + tap = kmalloc(sizeof(*tap), GFP_KERNEL); + if (unlikely(!tap)) + return NULL; + + blktap_control_initialize_tap(tap); + + spin_lock_irq(&blktap_control_lock); + for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) + if (!blktaps[minor]) + break; + + if (minor == MAX_BLKTAP_DEVICE) { + kfree(tap); + tap = NULL; + goto out; + } + + tap->minor = minor; + blktaps[minor] = tap; + +out: + spin_unlock_irq(&blktap_control_lock); + return tap; +} + +static struct blktap * +blktap_control_allocate_tap(void) +{ + int err, minor; + struct blktap *tap; + + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. 
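+ * That is why the code below takes the plain spin_lock_irq() /
+ * spin_unlock_irq() pair rather than the irqsave variants.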
+ */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&blktap_control_lock); + + for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) { + tap = blktaps[minor]; + if (!tap) + goto found; + + if (!tap->dev_inuse) { + blktap_control_initialize_tap(tap); + goto found; + } + } + + tap = NULL; + +found: + spin_unlock_irq(&blktap_control_lock); + + if (!tap) { + tap = blktap_control_create_tap(); + if (!tap) + return NULL; + } + + err = blktap_ring_create(tap); + if (err) { + BTERR("ring creation failed: %d\n", err); + clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); + return NULL; + } + + BTINFO("allocated tap %p\n", tap); + return tap; +} + +static int +blktap_control_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + unsigned long dev; + struct blktap *tap; + + switch (cmd) { + case BLKTAP2_IOCTL_ALLOC_TAP: { + struct blktap_handle h; + + tap = blktap_control_allocate_tap(); + if (!tap) { + BTERR("error allocating device\n"); + return -ENOMEM; + } + + h.ring = ring_major; + h.device = device_major; + h.minor = tap->minor; + + if (copy_to_user((struct blktap_handle __user *)arg, + &h, sizeof(h))) { + blktap_control_destroy_device(tap); + return -EFAULT; + } + + return 0; + } + + case BLKTAP2_IOCTL_FREE_TAP: + dev = arg; + + if (dev >= MAX_BLKTAP_DEVICE || !blktaps[dev]) + return -EINVAL; + + blktap_control_destroy_device(blktaps[dev]); + return 0; + } + + return -ENOIOCTLCMD; +} + +static struct file_operations blktap_control_file_operations = { + .owner = THIS_MODULE, + .ioctl = blktap_control_ioctl, +}; + +static struct miscdevice blktap_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "blktap-control", + .fops = &blktap_control_file_operations, +}; + +int +blktap_control_destroy_device(struct blktap *tap) +{ + int err; + unsigned long inuse; + + if (!tap) + return 0; + + set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); + + for (;;) { + inuse = tap->dev_inuse; + err = blktap_device_destroy(tap); + if (err) + goto wait; + + inuse = tap->dev_inuse; + err = blktap_ring_destroy(tap); + if (err) + goto wait; + + inuse = tap->dev_inuse; + err = blktap_sysfs_destroy(tap); + if (err) + goto wait; + + break; + + wait: + BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n", + inuse, tap->dev_inuse); + if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse)) + break; + } + + clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); + + if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) { + err = 0; + clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); + } + + return err; +} + +static int __init +blktap_control_init(void) +{ + int err; + + err = misc_register(&blktap_misc); + if (err) { + BTERR("misc_register failed for control device"); + return err; + } + + blktap_control_registered = 1; + return 0; +} + +static void +blktap_control_free(void) +{ + int i; + + for (i = 0; i < MAX_BLKTAP_DEVICE; i++) + blktap_control_destroy_device(blktaps[i]); + + if (blktap_control_registered) + if (misc_deregister(&blktap_misc) < 0) + BTERR("misc_deregister failed for control device"); +} + +static void +blktap_exit(void) +{ + blktap_control_free(); + blktap_ring_free(); + blktap_sysfs_free(); + blktap_device_free(); + blktap_request_pool_free(); +} + +static int __init +blktap_init(void) +{ + int err; + + err = blktap_request_pool_init(); + if (err) + return err; + + err = blktap_device_init(&device_major); + if (err) + goto fail; + + err = blktap_ring_init(&ring_major); + if (err) + goto fail; + + err = blktap_sysfs_init(); + if (err) + goto fail; + + err = blktap_control_init(); + if (err) + goto 
fail; + + return 0; + +fail: + blktap_exit(); + return err; +} + +module_init(blktap_init); +module_exit(blktap_exit); +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/device.c 2010-11-25 09:36:37.000000000 +0100 @@ -0,0 +1,1191 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "blktap.h" + +#include "../blkback/blkback-pagemap.h" + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +struct blktap_grant_table { + int cnt; + struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; +}; + +static int blktap_device_major; + +static inline struct blktap * +dev_to_blktap(struct blktap_device *dev) +{ + return container_of(dev, struct blktap, device); +} + +static int +blktap_device_open(struct inode *inode, struct file *filep) +{ + struct blktap *tap; + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + + if (!dev) + return -ENOENT; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap) || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -ENOENT; + + dev->users++; + + return 0; +} + +static int +blktap_device_release(struct inode *inode, struct file *filep) +{ + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + struct blktap *tap = dev_to_blktap(dev); + + dev->users--; + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + blktap_device_destroy(tap); + + return 0; +} + +static int +blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + +static int +blktap_device_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + int i; + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blktap_device_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif + case CDROMMULTISESSION: + BTDBG("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_IDLUN: + if (!access_ok(VERIFY_WRITE, argument, + sizeof(struct scsi_idlun))) + return -EFAULT; + + /* return 0 for now. 
*/ + __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); + __put_user(0, + &((struct scsi_idlun __user *)argument)->host_unique_id); + return 0; + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + +static struct block_device_operations blktap_device_file_operations = { + .owner = THIS_MODULE, + .open = blktap_device_open, + .release = blktap_device_release, + .ioctl = blktap_device_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .getgeo = blktap_device_getgeo +#endif +}; + +static int +blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + pte_t *pte = (pte_t *)data; + + BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte)); + set_pte(ptep, *pte); + return 0; +} + +static int +blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_map_uaddr_fn, &pte); +} + +static int +blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + struct mm_struct *mm = (struct mm_struct *)data; + + BTDBG("ptep %p\n", ptep); + pte_clear(mm, addr, ptep); + return 0; +} + +static int +blktap_umap_uaddr(struct mm_struct *mm, unsigned long address) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_umap_uaddr_fn, mm); +} + +static inline void +flush_tlb_kernel_page(unsigned long kvaddr) +{ +#ifdef CONFIG_X86 + xen_invlpg_all(kvaddr); +#else + flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); +#endif +} + +static void +blktap_device_end_dequeued_request(struct blktap_device *dev, + struct request *req, int uptodate) +{ + int ret; + + ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); + BUG_ON(ret); + + spin_lock_irq(&dev->lock); + end_that_request_last(req, uptodate); + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request) +{ + uint64_t ptep; + int ret, usr_idx; + unsigned int i, cnt; + struct page **map, *page; + struct blktap_ring *ring; + struct grant_handle_pair *khandle; + unsigned long kvaddr, uvaddr, offset; + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; + grant_handle_t self_gref[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int self_gref_nr = 0; + + cnt = 0; + ring = &tap->ring; + usr_idx = request->usr_idx; + map = ring->foreign_map.map; + + if (!ring->vma) + return; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); + + for (i = 0; i < request->nr_pages; i++) { + kvaddr = request_to_kaddr(request, i); + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + + khandle = request->handles + i; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[cnt], kvaddr, + GNTMAP_host_map, khandle->kernel); + cnt++; + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (create_lookup_pte_addr(ring->vma->vm_mm, + uvaddr, &ptep) != 0) { + BTERR("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[cnt], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + cnt++; + } + + offset = (uvaddr - ring->vma->vm_start) >> 
PAGE_SHIFT; + + BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, " + "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: " + "0x%08lx, handle: %u\n", offset, map[offset], request, + usr_idx, i, kvaddr, khandle->kernel, uvaddr, + khandle->user); + + page = map[offset]; + if (page) { + if (PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } else if ( + xen_feature(XENFEAT_auto_translated_physmap)) { + self_gref[self_gref_nr] = khandle->kernel; + self_gref_nr++; + } + } + map[offset] = NULL; + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + } + + if (cnt) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, cnt); + BUG_ON(ret); + } + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); + else { + for (i = 0; i < self_gref_nr; i++) { + gnttab_end_foreign_access_ref(self_gref[i]); + } + } +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_unmap(struct blktap *tap, struct blktap_request *request) +{ + int i, usr_idx; + unsigned long kvaddr; + + usr_idx = request->usr_idx; + down_write(&tap->ring.vma->vm_mm->mmap_sem); + + for (i = 0; i < request->nr_pages; i++) { + kvaddr = request_to_kaddr(request, i); + BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, " + "uvaddr: 0x%08lx, uhandle: %u\n", request, i, + kvaddr, request->handles[i].kernel, + MMAP_VADDR(tap->ring.user_vstart, usr_idx, i), + request->handles[i].user); + + if (!xen_feature(XENFEAT_auto_translated_physmap) && + request->handles[i].kernel == INVALID_GRANT_HANDLE) { + blktap_umap_uaddr(&init_mm, kvaddr); + flush_tlb_kernel_page(kvaddr); + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + } + + blktap_device_fast_flush(tap, request); + up_write(&tap->ring.vma->vm_mm->mmap_sem); +} + +/* + * called if the tapdisk process dies unexpectedly. + * fail and release any pending requests and disable queue. + */ +void +blktap_device_fail_pending_requests(struct blktap *tap) +{ + int usr_idx; + struct request *req; + struct blktap_device *dev; + struct blktap_request *request; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return; + + down_write(&tap->tap_sem); + + dev = &tap->device; + for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { + request = tap->pending_requests[usr_idx]; + if (!request || request->status != BLKTAP_REQUEST_PENDING) + continue; + + BTERR("%u:%u: failing pending %s of %d pages\n", + blktap_device_major, tap->minor, + (request->operation == BLKIF_OP_READ ? 
+ "read" : "write"), request->nr_pages); + + blktap_unmap(tap, request); + req = (struct request *)(unsigned long)request->id; + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + up_write(&tap->tap_sem); + + spin_lock_irq(&dev->lock); + + /* fail any future requests */ + dev->gd->queue->queuedata = NULL; + blk_start_queue(dev->gd->queue); + + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +void +blktap_device_finish_request(struct blktap *tap, + blkif_response_t *res, + struct blktap_request *request) +{ + int uptodate; + struct request *req; + struct blktap_device *dev; + + dev = &tap->device; + + blktap_unmap(tap, request); + + req = (struct request *)(unsigned long)request->id; + uptodate = (res->status == BLKIF_RSP_OKAY); + + BTDBG("req %p res status %d operation %d/%d id %lld\n", req, + res->status, res->operation, request->operation, + (unsigned long long)res->id); + + switch (request->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(res->status != BLKIF_RSP_OKAY)) + BTERR("Bad return from device data " + "request: %x\n", res->status); + blktap_device_end_dequeued_request(dev, req, uptodate); + break; + default: + BUG(); + } + + blktap_request_free(tap, request); +} + +static int +blktap_prep_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + unsigned int seg, struct page *page, + struct blktap_grant_table *table) +{ + uint64_t ptep; + uint32_t flags; + struct page *tap_page; + struct blktap_ring *ring; + struct blkback_pagemap map; + unsigned long uvaddr, kvaddr; + + ring = &tap->ring; + map = blkback_pagemap_read(page); + blkif_req->seg[seg].gref = map.gref; + + uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + flags = GNTMAP_host_map | + (request->operation == BLKIF_OP_WRITE ? 
GNTMAP_readonly : 0); + + gnttab_set_map_op(&table->grants[table->cnt], + kvaddr, flags, map.gref, map.domid); + table->cnt++; + + /* enable chained tap devices */ + tap_page = request_to_page(request, seg); + set_page_private(tap_page, page_private(page)); + SetPageBlkback(tap_page); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) { + BTERR("couldn't get a pte addr!\n"); + return -1; + } + + flags |= GNTMAP_application_map | GNTMAP_contains_pte; + gnttab_set_map_op(&table->grants[table->cnt], + ptep, flags, map.gref, map.domid); + table->cnt++; + + return 0; +} + +static int +blktap_map_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + struct blktap_grant_table *table) +{ + struct page *page; + int i, grant, err, usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, foreign_mfn; + + if (!table->cnt) + return 0; + + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + table->grants, table->cnt); + BUG_ON(err); + + grant = 0; + usr_idx = request->usr_idx; + ring = &tap->ring; + + for (i = 0; i < request->nr_pages; i++) { + if (!blkif_req->seg[i].gref) + continue; + + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + + if (unlikely(table->grants[grant].status != GNTST_okay)) { + BTERR("invalid kernel buffer: could not remap it\n"); + /* This should never happen: blkback should handle eagain first */ + BUG_ON(table->grants[grant].status == GNTST_eagain); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + + request->handles[i].kernel = table->grants[grant].handle; + foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT; + grant++; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (unlikely(table->grants[grant].status != GNTST_okay)) { + /* This should never happen: blkback should handle eagain first */ + WARN_ON(table->grants[grant].status == GNTST_eagain); + BTERR("invalid user buffer: could not remap it\n"); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + request->handles[i].user = table->grants[grant].handle; + grant++; + } + + if (err) + continue; + + page = request_to_page(request, i); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + set_phys_to_machine(page_to_pfn(page), + FOREIGN_FRAME(foreign_mfn)); + else if (vm_insert_page(ring->vma, uvaddr, page)) + err |= 1; + + BTDBG("pending_req: %p, seg: %d, page: %p, " + "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, " + "uhandle: %u\n", request, i, page, + pfn_to_kaddr(page_to_pfn(page)), + request->handles[i].kernel, + uvaddr, request->handles[i].user); + } + + return err; +} + +static int +blktap_map(struct blktap *tap, + struct blktap_request *request, + unsigned int seg, struct page *page) +{ + pte_t pte; + int usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, kvaddr; + int err = 0; + + ring = &tap->ring; + usr_idx = request->usr_idx; + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + pte = mk_pte(page, ring->vma->vm_page_prot); + blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte)); + flush_tlb_page(ring->vma, uvaddr); + blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL)); + flush_tlb_kernel_page(kvaddr); + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte)); + request->handles[seg].kernel = INVALID_GRANT_HANDLE; + } else { + /* grant this page access to self domain and map it. 
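+ * The direct PTE manipulation used in the branch above is not
+ * applicable with auto-translated physmap, so the page is instead
+ * granted to the local domain itself and mapped back through the grant
+ * table; the reference is kept in handles[seg].kernel and released via
+ * gnttab_end_foreign_access_ref() in blktap_device_fast_flush().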
*/ + domid_t domid = 0; /* XXX my domian id: grant table hypercall + doesn't understand DOMID_SELF */ + int gref; + uint32_t flags; + struct gnttab_map_grant_ref map; + struct page *tap_page; + + gref = gnttab_grant_foreign_access( + domid, page_to_pfn(page), + (request->operation == BLKIF_OP_WRITE)? + GTF_readonly: 0); + + flags = GNTMAP_host_map | + (request->operation == BLKIF_OP_WRITE ? + GNTMAP_readonly : 0); + + gnttab_set_map_op(&map, kvaddr, flags, gref, domid); + + /* enable chained tap devices */ + tap_page = request_to_page(request, seg); + set_page_private(tap_page, page_private(page)); + SetPageBlkback(tap_page); + + gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map); + + /* We are not expecting the grant op to fail */ + BUG_ON(map.status != GNTST_okay); + + err = vm_insert_page(ring->vma, uvaddr, tap_page); + if (err) { + struct gnttab_unmap_grant_ref unmap; + gnttab_set_unmap_op(&unmap, kvaddr, + GNTMAP_host_map, gref); + VOID(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, &unmap, 1)); + } else + request->handles[seg].kernel = gref; + } + request->handles[seg].user = INVALID_GRANT_HANDLE; + + BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, " + "uvaddr: 0x%08lx\n", request, seg, page, kvaddr, + uvaddr); + + return err; +} + +static int +blktap_device_process_request(struct blktap *tap, + struct blktap_request *request, + struct request *req) +{ + struct page *page; + int i, usr_idx, err; + struct blktap_ring *ring; + struct scatterlist *sg; + struct blktap_grant_table table; + unsigned int fsect, lsect, nr_sects; + unsigned long offset, uvaddr; + struct blkif_request blkif_req, *target; + + err = -1; + memset(&table, 0, sizeof(table)); + + if (!blktap_active(tap)) + goto out; + + ring = &tap->ring; + usr_idx = request->usr_idx; + blkif_req.id = usr_idx; + blkif_req.sector_number = (blkif_sector_t)req->sector; + blkif_req.handle = 0; + blkif_req.operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + + request->id = (unsigned long)req; + request->operation = blkif_req.operation; + request->status = BLKTAP_REQUEST_PENDING; + do_gettimeofday(&request->time); + + nr_sects = 0; + request->nr_pages = 0; + blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg); + BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + for (i = 0; i < blkif_req.nr_segments; ++i) { + sg = tap->sg + i; + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + nr_sects += sg->length >> 9; + + blkif_req.seg[i] = + (struct blkif_request_segment) { + .gref = 0, + .first_sect = fsect, + .last_sect = lsect }; + + if (PageBlkback(sg->page)) { + /* foreign page -- use xen */ + if (blktap_prep_foreign(tap, + request, + &blkif_req, + i, + sg->page, + &table)) + goto out; + } else { + /* do it the old fashioned way */ + if (blktap_map(tap, + request, + i, + sg->page)) + goto out; + } + + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT; + page = request_to_page(request, i); + ring->foreign_map.map[offset] = page; + SetPageReserved(page); + + BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n", + uvaddr, page, page_to_pfn(page)); + BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, " + "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n", + offset, request, i, + page, pfn_to_kaddr(page_to_pfn(page)), uvaddr); + + request->nr_pages++; + } + + if (blktap_map_foreign(tap, request, &blkif_req, &table)) + goto out; + + /* Finally, write the request message to the user ring. 
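+ * The slot id is set to usr_idx so the response picked up in
+ * blktap_read_ring() can be matched back to pending_requests[]; the
+ * wmb() orders the copy against the producer update, and the wakeup of
+ * the userspace tapdisk happens later, via blktap_ring_kick_user(),
+ * once the queue run in blktap_device_run_queue() has finished.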
*/ + target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt); + memcpy(target, &blkif_req, sizeof(blkif_req)); + target->id = request->usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ + ring->ring.req_prod_pvt++; + + if (rq_data_dir(req)) { + tap->stats.st_wr_sect += nr_sects; + tap->stats.st_wr_req++; + } else { + tap->stats.st_rd_sect += nr_sects; + tap->stats.st_rd_req++; + } + + err = 0; + +out: + if (err) + blktap_device_fast_flush(tap, request); + return err; +} + +#ifdef ENABLE_PASSTHROUGH +#define rq_for_each_bio_safe(_bio, _tmp, _req) \ + if ((_req)->bio) \ + for (_bio = (_req)->bio; \ + _bio && ((_tmp = _bio->bi_next) || 1); \ + _bio = _tmp) + +static void +blktap_device_forward_request(struct blktap *tap, struct request *req) +{ + struct bio *bio, *tmp; + struct blktap_device *dev; + + dev = &tap->device; + + rq_for_each_bio_safe(bio, tmp, req) { + bio->bi_bdev = dev->bdev; + submit_bio(bio->bi_rw, bio); + } +} + +static void +blktap_device_close_bdev(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + + if (dev->bdev) + blkdev_put(dev->bdev); + + dev->bdev = NULL; + clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); +} + +static int +blktap_device_open_bdev(struct blktap *tap, u32 pdev) +{ + struct block_device *bdev; + struct blktap_device *dev; + + dev = &tap->device; + + bdev = open_by_devnum(pdev, FMODE_WRITE); + if (IS_ERR(bdev)) { + BTERR("opening device %x:%x failed: %ld\n", + MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev)); + return PTR_ERR(bdev); + } + + if (!bdev->bd_disk) { + BTERR("device %x:%x doesn't exist\n", + MAJOR(pdev), MINOR(pdev)); + blkdev_put(dev->bdev); + return -ENOENT; + } + + dev->bdev = bdev; + set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); + + /* TODO: readjust queue parameters */ + + BTINFO("set device %d to passthrough on %x:%x\n", + tap->minor, MAJOR(pdev), MINOR(pdev)); + + return 0; +} + +int +blktap_device_enable_passthrough(struct blktap *tap, + unsigned major, unsigned minor) +{ + u32 pdev; + struct blktap_device *dev; + + dev = &tap->device; + pdev = MKDEV(major, minor); + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (dev->bdev) { + if (pdev) + return -EINVAL; + blktap_device_close_bdev(tap); + return 0; + } + + return blktap_device_open_bdev(tap, pdev); +} +#endif + +/* + * dev->lock held on entry + */ +static void +blktap_device_run_queue(struct blktap *tap) +{ + int queued, err; + request_queue_t *rq; + struct request *req; + struct blktap_ring *ring; + struct blktap_device *dev; + struct blktap_request *request; + + queued = 0; + ring = &tap->ring; + dev = &tap->device; + rq = dev->gd->queue; + + BTDBG("running queue for %d\n", tap->minor); + + while ((req = elv_next_request(rq)) != NULL) { + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (blk_barrier_rq(req)) { + end_request(req, 0); + continue; + } + +#ifdef ENABLE_PASSTHROUGH + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + blkdev_dequeue_request(req); + blktap_device_forward_request(tap, req); + continue; + } +#endif + + if (RING_FULL(&ring->ring)) { + wait: + /* Avoid pointless unplugs. 
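+ * The queue is stopped while the ring is full or no request struct is
+ * available; blktap_defer() parks this tap on its deferred queue, and
+ * blktap_run_deferred(), called after responses are consumed in
+ * blktap_read_ring(), gives it another chance to run.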
*/ + blk_stop_queue(rq); + blktap_defer(tap); + break; + } + + request = blktap_request_allocate(tap); + if (!request) { + tap->stats.st_oo_req++; + goto wait; + } + + BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) " + "buffer:%p [%s], pending: %p\n", req, tap->minor, + req->cmd, (unsigned long long)req->sector, + req->current_nr_sectors, req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read", request); + + blkdev_dequeue_request(req); + + spin_unlock_irq(&dev->lock); + down_write(&tap->tap_sem); + + err = blktap_device_process_request(tap, request, req); + if (!err) + queued++; + else { + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + up_write(&tap->tap_sem); + spin_lock_irq(&dev->lock); + } + + if (queued) + blktap_ring_kick_user(tap); +} + +/* + * dev->lock held on entry + */ +static void +blktap_device_do_request(request_queue_t *rq) +{ + struct request *req; + struct blktap *tap; + struct blktap_device *dev; + + dev = rq->queuedata; + if (!dev) + goto fail; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap)) + goto fail; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + blktap_device_run_queue(tap); + return; + +fail: + while ((req = elv_next_request(rq))) { + BTERR("device closed: failing secs %llu - %llu\n", + (unsigned long long)req->sector, + (unsigned long long)req->sector + req->nr_sectors); + end_request(req, 0); + } +} + +void +blktap_device_restart(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + + if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) { + blktap_defer(tap); + return; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + spin_lock_irq(&dev->lock); + + /* Re-enable calldowns. */ + if (dev->gd) { + struct request_queue *rq = dev->gd->queue; + + if (blk_queue_stopped(rq)) + blk_start_queue(rq); + + /* Kick things off immediately. */ + blktap_device_do_request(rq); + } + + spin_unlock_irq(&dev->lock); +} + +static void +blktap_device_configure(struct blktap *tap) +{ + struct request_queue *rq; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd) + return; + + dev = &tap->device; + rq = dev->gd->queue; + + spin_lock_irq(&dev->lock); + + set_capacity(dev->gd, tap->params.capacity); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, tap->params.sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. 
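+ * (blk_queue_dma_alignment() takes a mask, hence 511 for 512-byte
+ * alignment.)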
*/ + blk_queue_dma_alignment(rq, 511); + + spin_unlock_irq(&dev->lock); +} + +int +blktap_device_resume(struct blktap *tap) +{ + int err; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + err = blktap_ring_resume(tap); + if (err) + return err; + + /* device size may have changed */ + blktap_device_configure(tap); + + BTDBG("restarting device\n"); + blktap_device_restart(tap); + + return 0; +} + +int +blktap_device_pause(struct blktap *tap) +{ + unsigned long flags; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + spin_lock_irqsave(&dev->lock, flags); + + blk_stop_queue(dev->gd->queue); + set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + spin_unlock_irqrestore(&dev->lock, flags); + + return blktap_ring_pause(tap); +} + +int +blktap_device_destroy(struct blktap *tap) +{ + struct blktap_device *dev = &tap->device; + struct gendisk *gd = dev->gd; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return 0; + + BTINFO("destroy device %d users %d\n", tap->minor, dev->users); + + if (dev->users) + return -EBUSY; + + spin_lock_irq(&dev->lock); + /* No more blktap_device_do_request(). */ + blk_stop_queue(gd->queue); + clear_bit(BLKTAP_DEVICE, &tap->dev_inuse); + dev->gd = NULL; + spin_unlock_irq(&dev->lock); + +#ifdef ENABLE_PASSTHROUGH + if (dev->bdev) + blktap_device_close_bdev(tap); +#endif + + del_gendisk(gd); + blk_cleanup_queue(gd->queue); + put_disk(gd); + + wake_up(&tap->wq); + + return 0; +} + +int +blktap_device_create(struct blktap *tap) +{ + int minor, err; + struct gendisk *gd; + struct request_queue *rq; + struct blktap_device *dev; + + gd = NULL; + rq = NULL; + dev = &tap->device; + minor = tap->minor; + + if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return -EEXIST; + + if (blktap_validate_params(tap, &tap->params)) + return -EINVAL; + + BTINFO("minor %d sectors %Lu sector-size %lu\n", + minor, tap->params.capacity, tap->params.sector_size); + + err = -ENODEV; + + gd = alloc_disk(1); + if (!gd) + goto error; + + if (minor < 26) + sprintf(gd->disk_name, "tapdev%c", 'a' + minor); + else + sprintf(gd->disk_name, "tapdev%c%c", + 'a' + ((minor / 26) - 1), 'a' + (minor % 26)); + + gd->major = blktap_device_major; + gd->first_minor = minor; + gd->fops = &blktap_device_file_operations; + gd->private_data = dev; + + spin_lock_init(&dev->lock); + rq = blk_init_queue(blktap_device_do_request, &dev->lock); + if (!rq) + goto error; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif + + gd->queue = rq; + rq->queuedata = dev; + dev->gd = gd; + + set_bit(BLKTAP_DEVICE, &tap->dev_inuse); + blktap_device_configure(tap); + + add_disk(gd); + + err = 0; + goto out; + + error: + if (gd) + del_gendisk(gd); + if (rq) + blk_cleanup_queue(rq); + + out: + BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err); + return err; +} + +int __init +blktap_device_init(int *maj) +{ + int major; + + /* Dynamically allocate a major for this device */ + major = register_blkdev(0, "tapdev"); + if (major < 0) { + BTERR("Couldn't register blktap device\n"); + return -ENOMEM; + } + + blktap_device_major = *maj = major; + BTINFO("blktap device major %d\n", major); + + return 0; +} + +void +blktap_device_free(void) +{ + if (blktap_device_major) + if 
(unregister_blkdev(blktap_device_major, "tapdev")) + BTERR("blktap device unregister failed\n"); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/request.c 2010-01-04 11:56:34.000000000 +0100 @@ -0,0 +1,296 @@ +#include +#include + +#include "blktap.h" + +#define MAX_BUCKETS 8 +#define BUCKET_SIZE MAX_PENDING_REQS + +#define BLKTAP_POOL_CLOSING 1 + +struct blktap_request_bucket; + +struct blktap_request_handle { + int slot; + uint8_t inuse; + struct blktap_request request; + struct blktap_request_bucket *bucket; +}; + +struct blktap_request_bucket { + atomic_t reqs_in_use; + struct blktap_request_handle handles[BUCKET_SIZE]; + struct page **foreign_pages; +}; + +struct blktap_request_pool { + spinlock_t lock; + uint8_t status; + struct list_head free_list; + atomic_t reqs_in_use; + wait_queue_head_t wait_queue; + struct blktap_request_bucket *buckets[MAX_BUCKETS]; +}; + +static struct blktap_request_pool pool; + +static inline struct blktap_request_handle * +blktap_request_to_handle(struct blktap_request *req) +{ + return container_of(req, struct blktap_request_handle, request); +} + +static void +blktap_request_pool_init_request(struct blktap_request *request) +{ + int i; + + request->usr_idx = -1; + request->nr_pages = 0; + request->status = BLKTAP_REQUEST_FREE; + INIT_LIST_HEAD(&request->free_list); + for (i = 0; i < ARRAY_SIZE(request->handles); i++) { + request->handles[i].user = INVALID_GRANT_HANDLE; + request->handles[i].kernel = INVALID_GRANT_HANDLE; + } +} + +static int +blktap_request_pool_allocate_bucket(void) +{ + int i, idx; + unsigned long flags; + struct blktap_request *request; + struct blktap_request_handle *handle; + struct blktap_request_bucket *bucket; + + bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL); + if (!bucket) + goto fail; + + bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES); + if (!bucket->foreign_pages) + goto fail; + + spin_lock_irqsave(&pool.lock, flags); + + idx = -1; + for (i = 0; i < MAX_BUCKETS; i++) { + if (!pool.buckets[i]) { + idx = i; + pool.buckets[idx] = bucket; + break; + } + } + + if (idx == -1) { + spin_unlock_irqrestore(&pool.lock, flags); + goto fail; + } + + for (i = 0; i < BUCKET_SIZE; i++) { + handle = bucket->handles + i; + request = &handle->request; + + handle->slot = i; + handle->inuse = 0; + handle->bucket = bucket; + + blktap_request_pool_init_request(request); + list_add_tail(&request->free_list, &pool.free_list); + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return 0; + +fail: + if (bucket && bucket->foreign_pages) + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); + return -ENOMEM; +} + +static void +blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket) +{ + if (!bucket) + return; + + BTDBG("freeing bucket %p\n", bucket); + + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); +} + +struct page * +request_to_page(struct blktap_request *req, int seg) +{ + struct blktap_request_handle *handle = blktap_request_to_handle(req); + int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; + return handle->bucket->foreign_pages[idx]; +} + +int +blktap_request_pool_shrink(void) +{ + int i, err; + unsigned long flags; + struct blktap_request_bucket *bucket; + + err = -EAGAIN; + + spin_lock_irqsave(&pool.lock, flags); + + /* always keep at least one bucket */ + for (i = 1; i < MAX_BUCKETS; i++) { + bucket = pool.buckets[i]; + if (!bucket) + continue; + + if 
(atomic_read(&bucket->reqs_in_use)) + continue; + + blktap_request_pool_free_bucket(bucket); + pool.buckets[i] = NULL; + err = 0; + break; + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return err; +} + +int +blktap_request_pool_grow(void) +{ + return blktap_request_pool_allocate_bucket(); +} + +struct blktap_request * +blktap_request_allocate(struct blktap *tap) +{ + int i; + uint16_t usr_idx; + unsigned long flags; + struct blktap_request *request; + + usr_idx = -1; + request = NULL; + + spin_lock_irqsave(&pool.lock, flags); + + if (pool.status == BLKTAP_POOL_CLOSING) + goto out; + + for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) + if (!tap->pending_requests[i]) { + usr_idx = i; + break; + } + + if (usr_idx == (uint16_t)-1) + goto out; + + if (!list_empty(&pool.free_list)) { + request = list_entry(pool.free_list.next, + struct blktap_request, free_list); + list_del(&request->free_list); + } + + if (request) { + struct blktap_request_handle *handle; + + atomic_inc(&pool.reqs_in_use); + + handle = blktap_request_to_handle(request); + atomic_inc(&handle->bucket->reqs_in_use); + handle->inuse = 1; + + request->usr_idx = usr_idx; + + tap->pending_requests[usr_idx] = request; + tap->pending_cnt++; + } + +out: + spin_unlock_irqrestore(&pool.lock, flags); + return request; +} + +void +blktap_request_free(struct blktap *tap, struct blktap_request *request) +{ + int free; + unsigned long flags; + struct blktap_request_handle *handle; + + BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests)); + handle = blktap_request_to_handle(request); + + spin_lock_irqsave(&pool.lock, flags); + + handle->inuse = 0; + tap->pending_requests[request->usr_idx] = NULL; + blktap_request_pool_init_request(request); + list_add(&request->free_list, &pool.free_list); + atomic_dec(&handle->bucket->reqs_in_use); + free = atomic_dec_and_test(&pool.reqs_in_use); + + spin_unlock_irqrestore(&pool.lock, flags); + + if (--tap->pending_cnt == 0) + wake_up_interruptible(&tap->wq); + + if (free) + wake_up(&pool.wait_queue); +} + +void +blktap_request_pool_free(void) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&pool.lock, flags); + + pool.status = BLKTAP_POOL_CLOSING; + while (atomic_read(&pool.reqs_in_use)) { + spin_unlock_irqrestore(&pool.lock, flags); + wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use)); + spin_lock_irqsave(&pool.lock, flags); + } + + for (i = 0; i < MAX_BUCKETS; i++) { + blktap_request_pool_free_bucket(pool.buckets[i]); + pool.buckets[i] = NULL; + } + + spin_unlock_irqrestore(&pool.lock, flags); +} + +int __init +blktap_request_pool_init(void) +{ + int i, err; + + memset(&pool, 0, sizeof(pool)); + + spin_lock_init(&pool.lock); + INIT_LIST_HEAD(&pool.free_list); + atomic_set(&pool.reqs_in_use, 0); + init_waitqueue_head(&pool.wait_queue); + + for (i = 0; i < 2; i++) { + err = blktap_request_pool_allocate_bucket(); + if (err) + goto fail; + } + + return 0; + +fail: + blktap_request_pool_free(); + return err; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/ring.c 2010-08-31 09:24:21.000000000 +0200 @@ -0,0 +1,610 @@ +#include +#include + +#include "blktap.h" + +static int blktap_ring_major; + +static inline struct blktap * +vma_to_blktap(struct vm_area_struct *vma) +{ + struct vm_foreign_map *m = vma->vm_private_data; + struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map); + return container_of(r, struct blktap, ring); +} + + /* + * BLKTAP - immediately before the mmap area, + * we have a bunch of pages reserved for 
shared memory rings. + */ +#define RING_PAGES 1 + +static int +blktap_read_ring(struct blktap *tap) +{ + /* This is called to read responses from the ring. */ + int usr_idx; + RING_IDX rc, rp; + blkif_response_t res; + struct blktap_ring *ring; + struct blktap_request *request; + + down_read(&tap->tap_sem); + + ring = &tap->ring; + if (!ring->vma) { + up_read(&tap->tap_sem); + return 0; + } + + /* for each outstanding message on the ring */ + rp = ring->ring.sring->rsp_prod; + rmb(); + + for (rc = ring->ring.rsp_cons; rc != rp; rc++) { + memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ + ++ring->ring.rsp_cons; + + usr_idx = (int)res.id; + if (usr_idx >= MAX_PENDING_REQS || + !tap->pending_requests[usr_idx]) { + BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n", + rc, rp, usr_idx, tap->pid, ring->vma); + continue; + } + + request = tap->pending_requests[usr_idx]; + BTDBG("request %p response #%d id %x\n", request, rc, usr_idx); + blktap_device_finish_request(tap, &res, request); + } + + up_read(&tap->tap_sem); + + blktap_run_deferred(); + + return 0; +} + +static struct page * +blktap_ring_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. + */ + + return NOPAGE_SIGBUS; +} + +static pte_t +blktap_ring_clear_pte(struct vm_area_struct *vma, + unsigned long uvaddr, + pte_t *ptep, int is_fullmm) +{ + pte_t copy; + struct blktap *tap; + unsigned long kvaddr; + struct page **map, *page; + struct blktap_ring *ring; + struct blktap_request *request; + struct grant_handle_pair *khandle; + struct gnttab_unmap_grant_ref unmap[2]; + int offset, seg, usr_idx, count = 0; + + tap = vma_to_blktap(vma); + ring = &tap->ring; + map = ring->foreign_map.map; + BUG_ON(!map); /* TODO Should this be changed to if statement? */ + + /* + * Zap entry if the address is before the start of the grant + * mapped region. 
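+ * Addresses below user_vstart cover the shared ring itself (user_vstart + * is ring_vstart plus RING_PAGES pages, set up in blktap_ring_mmap()); only + * addresses at or above it refer to grant-mapped request segments whose + * grant handles must be torn down here.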
+ */ + if (uvaddr < ring->user_vstart) + return ptep_get_and_clear_full(vma->vm_mm, uvaddr, + ptep, is_fullmm); + + offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT); + usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; + seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; + + offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT); + page = map[offset]; + if (page && PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } + map[offset] = NULL; + + request = tap->pending_requests[usr_idx]; + kvaddr = request_to_kaddr(request, seg); + khandle = request->handles + seg; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[count], kvaddr, + GNTMAP_host_map, khandle->kernel); + count++; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + + copy = *ptep; + gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + count++; + } else + copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, + is_fullmm); + + if (count) + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, count)) + BUG(); + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + + return copy; +} + +static void +blktap_ring_vm_unmap(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + + down_write(&tap->tap_sem); + clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + up_write(&tap->tap_sem); +} + +static void +blktap_ring_vm_close(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + struct blktap_ring *ring = &tap->ring; + + blktap_ring_vm_unmap(vma); /* fail future requests */ + blktap_device_fail_pending_requests(tap); /* fail pending requests */ + blktap_device_restart(tap); /* fail deferred requests */ + + down_write(&tap->tap_sem); + + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + + kfree(ring->foreign_map.map); + ring->foreign_map.map = NULL; + + /* Free the ring page. 
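It was marked reserved in blktap_ring_mmap(), so clear PG_reserved before handing it back to the page allocator.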
*/ + ClearPageReserved(virt_to_page(ring->ring.sring)); + free_page((unsigned long)ring->ring.sring); + + BTINFO("unmapping ring %d\n", tap->minor); + ring->ring.sring = NULL; + ring->vma = NULL; + + up_write(&tap->tap_sem); + + wake_up(&tap->wq); +} + +static struct vm_operations_struct blktap_ring_vm_operations = { + .close = blktap_ring_vm_close, + .unmap = blktap_ring_vm_unmap, + .nopage = blktap_ring_nopage, + .zap_pte = blktap_ring_clear_pte, +}; + +static int +blktap_ring_open(struct inode *inode, struct file *filp) +{ + int idx; + struct blktap *tap; + + idx = iminor(inode); + if (idx < 0 || idx >= MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) { + BTERR("unable to open device blktap%d\n", idx); + return -ENODEV; + } + + tap = blktaps[idx]; + + BTINFO("opening device blktap%d\n", idx); + + if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse)) + return -ENODEV; + + /* Only one process can access ring at a time */ + if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse)) + return -EBUSY; + + filp->private_data = tap; + BTINFO("opened device %d\n", tap->minor); + + return 0; +} + +static int +blktap_ring_release(struct inode *inode, struct file *filp) +{ + struct blktap *tap = filp->private_data; + + BTINFO("freeing device %d\n", tap->minor); + clear_bit(BLKTAP_RING_FD, &tap->dev_inuse); + filp->private_data = NULL; + wake_up(&tap->wq); + return 0; +} + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size, err; + struct page **map; + struct blktap *tap; + blkif_sring_t *sring; + struct blktap_ring *ring; + + tap = filp->private_data; + ring = &tap->ring; + map = NULL; + sring = NULL; + + if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return -ENOMEM; + + size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (size != (MMAP_PAGES + RING_PAGES)) { + BTERR("you _must_ map exactly %lu pages!\n", + MMAP_PAGES + RING_PAGES); + return -EAGAIN; + } + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (!sring) { + BTERR("Couldn't alloc sring.\n"); + goto fail_mem; + } + + map = kzalloc(size * sizeof(struct page *), GFP_KERNEL); + if (!map) { + BTERR("Couldn't alloc VM_FOREIGN map.\n"); + goto fail_mem; + } + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE); + + ring->ring_vstart = vma->vm_start; + ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. 
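In auto-translated (HVM-style) guests the ring page can be inserted as a normal struct page; PV guests have to remap it by pseudo-physical frame number instead.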
*/ + if (xen_feature(XENFEAT_auto_translated_physmap)) + err = vm_insert_page(vma, vma->vm_start, + virt_to_page(ring->ring.sring)); + else + err = remap_pfn_range(vma, vma->vm_start, + __pa(ring->ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + BTERR("Mapping user ring failed: %d\n", err); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + ring->foreign_map.map = map; + vma->vm_private_data = &ring->foreign_map; + vma->vm_flags |= VM_FOREIGN; + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_ring_vm_operations; + +#ifdef CONFIG_X86 + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + tap->pid = current->pid; + BTINFO("blktap: mapping pid is %d\n", tap->pid); + + ring->vma = vma; + return 0; + + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + ClearPageReserved(virt_to_page(sring)); + fail_mem: + free_page((unsigned long)sring); + kfree(map); + + return -ENOMEM; +} + +static inline void +blktap_ring_set_message(struct blktap *tap, int msg) +{ + struct blktap_ring *ring = &tap->ring; + + down_read(&tap->tap_sem); + if (ring->ring.sring) + ring->ring.sring->private.tapif_user.msg = msg; + up_read(&tap->tap_sem); +} + +static int +blktap_ring_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct blktap_params params; + struct blktap *tap = filp->private_data; + + BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg); + + switch(cmd) { + case BLKTAP2_IOCTL_KICK_FE: + /* There are fe messages to process. */ + return blktap_read_ring(tap); + + case BLKTAP2_IOCTL_CREATE_DEVICE: + if (!arg) + return -EINVAL; + + if (copy_from_user(¶ms, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, ¶ms)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return blktap_device_create(tap); + + case BLKTAP2_IOCTL_SET_PARAMS: + if (!arg) + return -EINVAL; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (copy_from_user(¶ms, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, ¶ms)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return 0; + + case BLKTAP2_IOCTL_PAUSE: + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + set_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + + case BLKTAP2_IOCTL_REOPEN: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (!arg) + return -EINVAL; + + if (copy_to_user((char __user *)arg, + tap->params.name, + strlen(tap->params.name) + 1)) + return -EFAULT; + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + case BLKTAP2_IOCTL_RESUME: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + tap->ring.response = (int)arg; + if (!tap->ring.response) + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + } + + return -ENOIOCTLCMD; +} + +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = 
&tap->ring; + + poll_wait(filp, &ring->poll_wait, wait); + if (ring->ring.sring->private.tapif_user.msg || + ring->ring.req_prod_pvt != ring->ring.sring->req_prod) { + RING_PUSH_REQUESTS(&ring->ring); + return POLLIN | POLLRDNORM; + } + + return 0; +} + +static struct file_operations blktap_ring_file_operations = { + .owner = THIS_MODULE, + .open = blktap_ring_open, + .release = blktap_ring_release, + .ioctl = blktap_ring_ioctl, + .mmap = blktap_ring_mmap, + .poll = blktap_ring_poll, +}; + +void +blktap_ring_kick_user(struct blktap *tap) +{ + wake_up_interruptible(&tap->ring.poll_wait); +} + +int +blktap_ring_resume(struct blktap *tap) +{ + int err; + struct blktap_ring *ring = &tap->ring; + + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + /* set shared flag for resume */ + ring->response = 0; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME); + blktap_ring_kick_user(tap); + + wait_event_interruptible(tap->wq, ring->response || + !test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + + err = ring->response; + ring->response = 0; + + BTDBG("err: %d\n", err); + + if (err) + return err; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_pause(struct blktap *tap) +{ + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + BTDBG("draining queue\n"); + wait_event_interruptible(tap->wq, !tap->pending_cnt); + if (tap->pending_cnt) + return -EAGAIN; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE); + blktap_ring_kick_user(tap); + + BTDBG("waiting for tapdisk response\n"); + wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_destroy(struct blktap *tap) +{ + if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) && + !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return 0; + + BTDBG("sending tapdisk close message\n"); + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE); + blktap_ring_kick_user(tap); + + return -EAGAIN; +} + +static void +blktap_ring_initialize(struct blktap_ring *ring, int minor) +{ + memset(ring, 0, sizeof(*ring)); + init_waitqueue_head(&ring->poll_wait); + ring->devno = MKDEV(blktap_ring_major, minor); +} + +int +blktap_ring_create(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + blktap_ring_initialize(ring, tap->minor); + return blktap_sysfs_create(tap); +} + +int __init +blktap_ring_init(int *major) +{ + int err; + + err = register_chrdev(0, "blktap2", &blktap_ring_file_operations); + if (err < 0) { + BTERR("error registering blktap ring device: %d\n", err); + return err; + } + + blktap_ring_major = *major = err; + BTINFO("blktap ring major: %d\n", blktap_ring_major); + return 0; +} + +int +blktap_ring_free(void) +{ + if (blktap_ring_major) + unregister_chrdev(blktap_ring_major, "blktap2"); + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/sysfs.c 2011-03-02 12:00:16.000000000 +0100 @@ -0,0 +1,425 @@ +#include +#include +#include + +#include "blktap.h" + +int blktap_debug_level = 1; + +static struct class *class; +static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq); + +static inline void +blktap_sysfs_get(struct blktap *tap) +{ + atomic_inc(&tap->ring.sysfs_refcnt); +} + +static inline void +blktap_sysfs_put(struct blktap *tap) +{ + if (atomic_dec_and_test(&tap->ring.sysfs_refcnt)) + 
wake_up(&sysfs_wq); +} + +static inline void +blktap_sysfs_enter(struct blktap *tap) +{ + blktap_sysfs_get(tap); /* pin sysfs device */ + mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */ +} + +static inline void +blktap_sysfs_exit(struct blktap *tap) +{ + mutex_unlock(&tap->ring.sysfs_mutex); + blktap_sysfs_put(tap); +} + +static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t); +static CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); +static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t); +static CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); + +static ssize_t +blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EPERM; + goto out; + } + + if (size > BLKTAP2_MAX_MESSAGE_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) { + err = -EINVAL; + goto out; + } + + snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf); + err = size; + +out: + blktap_sysfs_exit(tap); + return err; +} + +static ssize_t +blktap_sysfs_get_name(struct class_device *dev, char *buf) +{ + ssize_t size; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev) + size = -ENODEV; + else if (tap->params.name[0]) + size = sprintf(buf, "%s\n", tap->params.name); + else + size = sprintf(buf, "%d\n", tap->minor); + + blktap_sysfs_exit(tap); + + return size; +} +static CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR, + blktap_sysfs_get_name, blktap_sysfs_set_name); + +static ssize_t +blktap_sysfs_remove_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + if (!tap->ring.dev) + return size; + + if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -EBUSY; + + err = blktap_control_destroy_device(tap); + + return (err ? : size); +} +static CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); + +static ssize_t +blktap_sysfs_pause_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + BTDBG("pausing %u:%u: dev_inuse: %lu\n", + MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + err = -EBUSY; + goto out; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = 0; + goto out; + } + + err = blktap_device_pause(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_resume); + } + +out: + blktap_sysfs_exit(tap); + + return (err ? 
err : size); +} + +static ssize_t +blktap_sysfs_resume_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = blktap_device_resume(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_resume); + class_device_create_file(dev, &class_device_attr_pause); + } + +out: + blktap_sysfs_exit(tap); + + BTDBG("returning %zd\n", (err ? err : size)); + return (err ? err : size); +} + +#ifdef ENABLE_PASSTHROUGH +static ssize_t +blktap_sysfs_enable_passthrough(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + unsigned major, minor; + struct blktap *tap = (struct blktap *)dev->class_data; + + BTINFO("passthrough request enabled\n"); + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = sscanf(buf, "%x:%x", &major, &minor); + if (err != 2) { + err = -EINVAL; + goto out; + } + + err = blktap_device_enable_passthrough(tap, major, minor); + +out: + blktap_sysfs_exit(tap); + BTDBG("returning %d\n", (err ? err : size)); + return (err ? err : size); +} +#endif + +static ssize_t +blktap_sysfs_debug_device(struct class_device *dev, char *buf) +{ + char *tmp; + int i, ret; + struct blktap *tap = (struct blktap *)dev->class_data; + + tmp = buf; + blktap_sysfs_get(tap); + + if (!tap->ring.dev) { + ret = sprintf(tmp, "no device\n"); + goto out; + } + + tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n", + tap->params.name, MAJOR(tap->ring.devno), + MINOR(tap->ring.devno), atomic_read(&tap->refcnt), + tap->dev_inuse); + tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, " + "device users: %d\n", tap->params.capacity, + tap->params.sector_size, tap->device.users); + + down_read(&tap->tap_sem); + + tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt); + for (i = 0; i < MAX_PENDING_REQS; i++) { + struct blktap_request *req = tap->pending_requests[i]; + if (!req) + continue; + + tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, " + "status: 0x%02x, pendcnt: %d, " + "nr_pages: %u, op: %d, time: %lu:%lu\n", + i, (unsigned long long)req->id, req->usr_idx, + req->status, atomic_read(&req->pendcnt), + req->nr_pages, req->operation, req->time.tv_sec, + req->time.tv_usec); + } + + up_read(&tap->tap_sem); + ret = (tmp - buf) + 1; + +out: + blktap_sysfs_put(tap); + BTDBG("%s\n", buf); + + return ret; +} +static CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); + +int +blktap_sysfs_create(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + if (!class) + return -ENODEV; + + ring = &tap->ring; + + dev = class_device_create(class, NULL, ring->devno, + NULL, "blktap%d", tap->minor); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ring->dev = dev; + dev->class_data = tap; + + mutex_init(&ring->sysfs_mutex); + atomic_set(&ring->sysfs_refcnt, 0); + set_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + class_device_create_file(dev, &class_device_attr_name); + class_device_create_file(dev, &class_device_attr_remove); + 
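/* Only the "pause" control is exposed at creation time; blktap_sysfs_pause_device() swaps it for "resume" once the tap is paused, and back again on resume. */ +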
class_device_create_file(dev, &class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_debug); + + return 0; +} + +int +blktap_sysfs_destroy(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + ring = &tap->ring; + dev = ring->dev; + if (!class || !dev) + return 0; + + ring->dev = NULL; + if (wait_event_interruptible(sysfs_wq, + !atomic_read(&tap->ring.sysfs_refcnt))) + return -EAGAIN; + + /* XXX: is it safe to remove the class from a sysfs attribute? */ + class_device_remove_file(dev, &class_device_attr_name); + class_device_remove_file(dev, &class_device_attr_remove); + class_device_remove_file(dev, &class_device_attr_pause); + class_device_remove_file(dev, &class_device_attr_resume); + class_device_remove_file(dev, &class_device_attr_debug); + class_device_destroy(class, ring->devno); + + clear_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + return 0; +} + +static ssize_t +blktap_sysfs_show_verbosity(struct class *class, char *buf) +{ + return sprintf(buf, "%d\n", blktap_debug_level); +} + +static ssize_t +blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) +{ + int level; + + if (sscanf(buf, "%d", &level) == 1) { + blktap_debug_level = level; + return size; + } + + return -EINVAL; +} +static CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR, + blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); + +static ssize_t +blktap_sysfs_show_devices(struct class *class, char *buf) +{ + int i, ret; + struct blktap *tap; + + ret = 0; + for (i = 0; i < MAX_BLKTAP_DEVICE; i++) { + tap = blktaps[i]; + if (!tap) + continue; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + continue; + + ret += sprintf(buf + ret, "%d ", tap->minor); + ret += snprintf(buf + ret, sizeof(tap->params.name) - 1, + tap->params.name); + ret += sprintf(buf + ret, "\n"); + } + + return ret; +} +static CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL); + +void +blktap_sysfs_free(void) +{ + if (!class) + return; + + class_remove_file(class, &class_attr_verbosity); + class_remove_file(class, &class_attr_devices); + + class_destroy(class); +} + +int __init +blktap_sysfs_init(void) +{ + struct class *cls; + + if (class) + return -EEXIST; + + cls = class_create(THIS_MODULE, "blktap2"); + if (IS_ERR(cls)) + return PTR_ERR(cls); + + class_create_file(cls, &class_attr_verbosity); + class_create_file(cls, &class_attr_devices); + + class = cls; + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/blktap2/wait_queue.c 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,40 @@ +#include +#include + +#include "blktap.h" + +static LIST_HEAD(deferred_work_queue); +static DEFINE_SPINLOCK(deferred_work_lock); + +void +blktap_run_deferred(void) +{ + LIST_HEAD(queue); + struct blktap *tap; + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + list_splice_init(&deferred_work_queue, &queue); + list_for_each_entry(tap, &queue, deferred_queue) + clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + spin_unlock_irqrestore(&deferred_work_lock, flags); + + while (!list_empty(&queue)) { + tap = list_entry(queue.next, struct blktap, deferred_queue); + list_del_init(&tap->deferred_queue); + blktap_device_restart(tap); + } +} + +void +blktap_defer(struct blktap *tap) +{ + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) { + set_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + list_add_tail(&tap->deferred_queue, &deferred_work_queue); + } + 
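/* BLKTAP_DEFERRED keeps the tap from being queued twice; blktap_run_deferred() clears it again before restarting the device. */ +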
spin_unlock_irqrestore(&deferred_work_lock, flags); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/char/Makefile 2007-07-10 09:42:30.000000000 +0200 @@ -0,0 +1 @@ +obj-$(CONFIG_XEN_DEVMEM) := mem.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/char/mem.c 2007-08-06 15:10:49.000000000 +0200 @@ -0,0 +1,190 @@ +/* + * Originally from linux/drivers/char/mem.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Added devfs support. + * Jan-11-1998, C. Scott Ananian + * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline int uncached_access(struct file *file) +{ + if (file->f_flags & O_SYNC) + return 1; + /* Xen sets correct MTRR type on non-RAM for us. */ + return 0; +} + +/* + * This funcion reads the *physical* memory. The f_pos points directly to the + * memory location. + */ +static ssize_t read_mem(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos, ignored; + ssize_t read = 0, sz; + void __iomem *v; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + v = ioremap(p, sz); + if (IS_ERR(v) || v == NULL) { + /* + * Some programs (e.g., dmidecode) groove off into + * weird RAM areas where no tables can possibly exist + * (because Xen will have stomped on them!). These + * programs get rather upset if we let them know that + * Xen failed their access, so we fake out a read of + * all zeroes. + */ + if (clear_user(buf, count)) + return -EFAULT; + read += count; + break; + } + + ignored = copy_to_user(buf, v, sz); + iounmap(v); + if (ignored) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + + *ppos += read; + return read; +} + +static ssize_t write_mem(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + unsigned long p = *ppos, ignored; + ssize_t written = 0, sz; + void __iomem *v; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + v = ioremap(p, sz); + if (v == NULL) + break; + if (IS_ERR(v)) { + if (written == 0) + return PTR_ERR(v); + break; + } + + ignored = copy_from_user(v, buf, sz); + iounmap(v); + if (ignored) { + written += sz - ignored; + if (written) + break; + return -EFAULT; + } + buf += sz; + p += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; +} + +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM +static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if (uncached_access(file)) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + /* We want to return the real error code, not EAGAIN. */ + return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot, DOMID_IO); +} +#endif + +/* + * The memory devices use the full 32/64 bits of the offset, and so we cannot + * check against negative addresses: they are ok. The return value is weird, + * though, in that case (0). + * + * also note that seeking relative to the "end of file" isn't supported: + * it has no meaning, so it returns -EINVAL. 
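+ * (orig == 0 is SEEK_SET, orig == 1 is SEEK_CUR; anything else, including + * SEEK_END, falls through to the -EINVAL default below.)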
+ */ +static loff_t memory_lseek(struct file * file, loff_t offset, int orig) +{ + loff_t ret; + + mutex_lock(&file->f_dentry->d_inode->i_mutex); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + force_successful_syscall_return(); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return ret; +} + +static int open_mem(struct inode * inode, struct file * filp) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +const struct file_operations mem_fops = { + .llseek = memory_lseek, + .read = read_mem, + .write = write_mem, + .mmap = xen_mmap_mem, + .open = open_mem, +}; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/console/Makefile 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,2 @@ + +obj-y := console.o xencons_ring.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/console/console.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,753 @@ +/****************************************************************************** + * console.c + * + * Virtual console driver. + * + * Copyright (c) 2002-2004, K A Fraser. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Modes: + * 'xencons=off' [XC_OFF]: Console is disabled. + * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. + * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. + * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'. + * 'xencons=hvc' [XC_HVC]: Console attached to '/dev/hvc0'. + * default: XC_XVC + * + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses + * warnings from standard distro startup scripts. 
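+ * + * A trailing number selects the unit, e.g. 'xencons=ttyS1' attaches the + * console to /dev/ttyS1; without a number each mode falls back to its + * default unit (see xencons_setup() and xen_console_init() below).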
+ */ +static enum { + XC_OFF, XC_TTY, XC_SERIAL, XC_XVC, XC_HVC +} xc_mode = XC_XVC; +static int xc_num = -1; + +/* /dev/xvc0 device number allocated by lanana.org. */ +#define XEN_XVC_MAJOR 204 +#define XEN_XVC_MINOR 191 + +/* /dev/hvc0 device number */ +#define XEN_HVC_MAJOR 229 +#define XEN_HVC_MINOR 0 + +#ifdef CONFIG_MAGIC_SYSRQ +static unsigned long sysrq_requested; +extern int sysrq_enabled; +#endif + +static int __init xencons_setup(char *str) +{ + char *q; + int n; + extern int console_use_vt; + + console_use_vt = 1; + if (!strncmp(str, "ttyS", 4)) { + xc_mode = XC_SERIAL; + str += 4; + } else if (!strncmp(str, "tty", 3)) { + xc_mode = XC_TTY; + str += 3; + console_use_vt = 0; + } else if (!strncmp(str, "xvc", 3)) { + xc_mode = XC_XVC; + str += 3; + } else if (!strncmp(str, "hvc", 3)) { + xc_mode = XC_HVC; + str += 3; + } else if (!strncmp(str, "off", 3)) { + xc_mode = XC_OFF; + str += 3; + } + + n = simple_strtol(str, &q, 10); + if (q != str) + xc_num = n; + + return 1; +} +__setup("xencons=", xencons_setup); + +/* The kernel and user-land drivers share a common transmit buffer. */ +static unsigned int wbuf_size = 4096; +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1)) +static char *wbuf; +static unsigned int wc, wp; /* write_cons, write_prod */ + +static int __init xencons_bufsz_setup(char *str) +{ + unsigned int goal; + goal = simple_strtoul(str, NULL, 0); + if (goal) { + goal = roundup_pow_of_two(goal); + if (wbuf_size < goal) + wbuf_size = goal; + } + return 1; +} +__setup("xencons_bufsz=", xencons_bufsz_setup); + +/* This lock protects accesses to the common transmit buffer. */ +static DEFINE_SPINLOCK(xencons_lock); + +/* Common transmit-kick routine. */ +static void __xencons_tx_flush(void); + +static struct tty_driver *xencons_driver; + +/******************** Kernel console driver ********************************/ + +static void kcons_write(struct console *c, const char *s, unsigned int count) +{ + int i = 0; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + while (i < count) { + for (; i < count; i++) { + if ((wp - wc) >= (wbuf_size - 1)) + break; + if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n') + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + } + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void kcons_write_dom0(struct console *c, const char *s, unsigned int count) +{ + + while (count > 0) { + int rc; + rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s); + if (rc <= 0) + break; + count -= rc; + s += rc; + } +} + +static struct tty_driver *kcons_device(struct console *c, int *index) +{ + *index = 0; + return xencons_driver; +} + +static struct console kcons_info = { + .device = kcons_device, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +static int __init xen_console_init(void) +{ + if (!is_running_on_xen()) + goto out; + + if (is_initial_xendomain()) { + kcons_info.write = kcons_write_dom0; + } else { + if (!xen_start_info->console.domU.evtchn) + goto out; + kcons_info.write = kcons_write; + } + + switch (xc_mode) { + case XC_XVC: + strcpy(kcons_info.name, "xvc"); + if (xc_num == -1) + xc_num = 0; + break; + + case XC_HVC: + strcpy(kcons_info.name, "hvc"); + if (xc_num == -1) + xc_num = 0; + if (!is_initial_xendomain()) + add_preferred_console(kcons_info.name, xc_num, NULL); + break; + + case XC_SERIAL: + strcpy(kcons_info.name, "ttyS"); + if (xc_num == -1) + xc_num = 0; + break; + + case XC_TTY: + strcpy(kcons_info.name, "tty"); + if (xc_num == -1) + xc_num = 1; + break; + + default: + goto out; + 
} + + wbuf = alloc_bootmem(wbuf_size); + + register_console(&kcons_info); + + out: + return 0; +} +console_initcall(xen_console_init); + +/*** Useful function for console debugging -- goes straight to Xen. ***/ +asmlinkage int xprintk(const char *fmt, ...) +{ + va_list args; + int printk_len; + static char printk_buf[1024]; + + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; +} + +/*** Forcibly flush console data before dying. ***/ +void xencons_force_flush(void) +{ + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. */ + if (!is_running_on_xen() || + is_initial_xendomain() || + !xen_start_info->console.domU.evtchn) + return; + + /* Spin until console data is flushed through to the daemon. */ + while (wc != wp) { + int sent = 0; + if ((sz = wp - wc) == 0) + continue; + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) + wc += sent; + } +} + + +void __init dom0_init_screen_info(const struct dom0_vga_console_info *info, size_t size) +{ + /* This is drawn from a dump from vgacon:startup in + * standard Linux. */ + screen_info.orig_video_mode = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = 25; + screen_info.orig_video_cols = 80; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_points = 16; + screen_info.orig_y = screen_info.orig_video_lines - 1; + + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) + + sizeof(info->u.text_mode_3)) + break; + screen_info.orig_video_lines = info->u.text_mode_3.rows; + screen_info.orig_video_cols = info->u.text_mode_3.columns; + screen_info.orig_x = info->u.text_mode_3.cursor_x; + screen_info.orig_y = info->u.text_mode_3.cursor_y; + screen_info.orig_video_points = + info->u.text_mode_3.font_height; + break; + + case XEN_VGATYPE_VESA_LFB: + if (size < offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps)) + break; + screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info.lfb_width = info->u.vesa_lfb.width; + screen_info.lfb_height = info->u.vesa_lfb.height; + screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info.lfb_base = info->u.vesa_lfb.lfb_base; + screen_info.lfb_size = info->u.vesa_lfb.lfb_size; + screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info.red_size = info->u.vesa_lfb.red_size; + screen_info.red_pos = info->u.vesa_lfb.red_pos; + screen_info.green_size = info->u.vesa_lfb.green_size; + screen_info.green_pos = info->u.vesa_lfb.green_pos; + screen_info.blue_size = info->u.vesa_lfb.blue_size; + screen_info.blue_pos = info->u.vesa_lfb.blue_pos; + screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.gbl_caps) + + sizeof(info->u.vesa_lfb.gbl_caps)) + screen_info.capabilities = info->u.vesa_lfb.gbl_caps; + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.mode_attrs) + + sizeof(info->u.vesa_lfb.mode_attrs)) + screen_info.vesa_attributes = info->u.vesa_lfb.mode_attrs; + break; + } +} + + +/******************** User-space console driver (/dev/console) ************/ + +#define DRV(_d) (_d) +#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \ + ((_tty)->index != (xc_num - 1))) + +static struct termios 
*xencons_termios[MAX_NR_CONSOLES]; +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct tty_struct *xencons_tty; +static int xencons_priv_irq; +static char x_char; + +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if (xencons_tty == NULL) + goto out; + + for (i = 0; i < len; i++) { +#ifdef CONFIG_MAGIC_SYSRQ + if (sysrq_enabled) { + if (buf[i] == '\x0f') { /* ^O */ + if (!sysrq_requested) { + sysrq_requested = jiffies; + continue; /* don't print sysrq key */ + } + sysrq_requested = 0; + } else if (sysrq_requested) { + unsigned long sysrq_timeout = + sysrq_requested + HZ*2; + sysrq_requested = 0; + if (time_before(jiffies, sysrq_timeout)) { + spin_unlock_irqrestore( + &xencons_lock, flags); + handle_sysrq( + buf[i], regs, xencons_tty); + spin_lock_irqsave( + &xencons_lock, flags); + continue; + } + } + } +#endif + tty_insert_flip_char(xencons_tty, buf[i], 0); + } + tty_flip_buffer_push(xencons_tty); + + out: + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void __xencons_tx_flush(void) +{ + int sent, sz, work_done = 0; + + if (x_char) { + if (is_initial_xendomain()) + kcons_write_dom0(NULL, &x_char, 1); + else + while (x_char) + if (xencons_ring_send(&x_char, 1) == 1) + break; + x_char = 0; + work_done = 1; + } + + while (wc != wp) { + sz = wp - wc; + if (sz > (wbuf_size - WBUF_MASK(wc))) + sz = wbuf_size - WBUF_MASK(wc); + if (is_initial_xendomain()) { + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); + wc += sz; + } else { + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent == 0) + break; + wc += sent; + } + work_done = 1; + } + + if (work_done && (xencons_tty != NULL)) { + wake_up_interruptible(&xencons_tty->write_wait); + if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + (xencons_tty->ldisc.write_wakeup != NULL)) + (xencons_tty->ldisc.write_wakeup)(xencons_tty); + } +} + +void xencons_tx(void) +{ + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +/* Privileged receive callback and transmit kicker. 
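Dom0 has no shared console ring; input is pulled straight from the hypervisor via HYPERVISOR_console_io(CONSOLEIO_read, ...) whenever VIRQ_CONSOLE fires.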
*/ +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + static char rbuf[16]; + int l; + + while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) + xencons_rx(rbuf, l, regs); + + xencons_tx(); + + return IRQ_HANDLED; +} + +static int xencons_write_room(struct tty_struct *tty) +{ + return wbuf_size - (wp - wc); +} + +static int xencons_chars_in_buffer(struct tty_struct *tty) +{ + return wp - wc; +} + +static void xencons_send_xchar(struct tty_struct *tty, char ch) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + x_char = ch; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_throttle(struct tty_struct *tty) +{ + if (DUMMY_TTY(tty)) + return; + + if (I_IXOFF(tty)) + xencons_send_xchar(tty, STOP_CHAR(tty)); +} + +static void xencons_unthrottle(struct tty_struct *tty) +{ + if (DUMMY_TTY(tty)) + return; + + if (I_IXOFF(tty)) { + if (x_char != 0) + x_char = 0; + else + xencons_send_xchar(tty, START_CHAR(tty)); + } +} + +static void xencons_flush_buffer(struct tty_struct *tty) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + wc = wp = 0; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static inline int __xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ((wp - wc) == wbuf_size) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + +static int xencons_write( + struct tty_struct *tty, + const unsigned char *buf, + int count) +{ + int i; + unsigned long flags; + + if (DUMMY_TTY(tty)) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for (i = 0; i < count; i++) + if (!__xencons_put_char(buf[i])) + break; + + if (i != 0) + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return i; +} + +static void xencons_put_char(struct tty_struct *tty, u_char ch) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + (void)__xencons_put_char(ch); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_flush_chars(struct tty_struct *tty) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) +{ + unsigned long orig_jiffies = jiffies; + + if (DUMMY_TTY(tty)) + return; + + while (DRV(tty->driver)->chars_in_buffer(tty)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + if (signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + } + + set_current_state(TASK_RUNNING); +} + +static int xencons_open(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return 0; + + spin_lock_irqsave(&xencons_lock, flags); + tty->driver_data = NULL; + if (xencons_tty == NULL) + xencons_tty = tty; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); + + return 0; +} + +static void xencons_close(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if (DUMMY_TTY(tty)) + return; + + mutex_lock(&tty_mutex); + + if (tty->count != 1) { + mutex_unlock(&tty_mutex); + return; + } + + /* Prevent other threads from re-opening this tty. 
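TTY_CLOSING is set while tty_mutex is still held, so a concurrent open cannot sneak in before the buffers are flushed below.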
*/ + set_bit(TTY_CLOSING, &tty->flags); + mutex_unlock(&tty_mutex); + + tty->closing = 1; + tty_wait_until_sent(tty, 0); + if (DRV(tty->driver)->flush_buffer != NULL) + DRV(tty->driver)->flush_buffer(tty); + if (tty->ldisc.flush_buffer != NULL) + tty->ldisc.flush_buffer(tty); + tty->closing = 0; + spin_lock_irqsave(&xencons_lock, flags); + xencons_tty = NULL; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static struct tty_operations xencons_ops = { + .open = xencons_open, + .close = xencons_close, + .write = xencons_write, + .write_room = xencons_write_room, + .put_char = xencons_put_char, + .flush_chars = xencons_flush_chars, + .chars_in_buffer = xencons_chars_in_buffer, + .send_xchar = xencons_send_xchar, + .flush_buffer = xencons_flush_buffer, + .throttle = xencons_throttle, + .unthrottle = xencons_unthrottle, + .wait_until_sent = xencons_wait_until_sent, +}; + +static int __init xencons_init(void) +{ + int rc; + + if (!is_running_on_xen()) + return -ENODEV; + + if (xc_mode == XC_OFF) + return 0; + + if (!is_initial_xendomain()) { + rc = xencons_ring_init(); + if (rc) + return rc; + } + + xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ? + MAX_NR_CONSOLES : 1); + if (xencons_driver == NULL) + return -ENOMEM; + + DRV(xencons_driver)->name = "xencons"; + DRV(xencons_driver)->major = TTY_MAJOR; + DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; + DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; + DRV(xencons_driver)->init_termios = tty_std_termios; + DRV(xencons_driver)->flags = + TTY_DRIVER_REAL_RAW | + TTY_DRIVER_RESET_TERMIOS; + DRV(xencons_driver)->termios = xencons_termios; + DRV(xencons_driver)->termios_locked = xencons_termios_locked; + + switch (xc_mode) { + case XC_XVC: + DRV(xencons_driver)->name = "xvc"; + DRV(xencons_driver)->major = XEN_XVC_MAJOR; + DRV(xencons_driver)->minor_start = XEN_XVC_MINOR; + DRV(xencons_driver)->name_base = xc_num; + break; + case XC_HVC: + DRV(xencons_driver)->name = "hvc"; + DRV(xencons_driver)->major = XEN_HVC_MAJOR; + DRV(xencons_driver)->minor_start = XEN_HVC_MINOR; + DRV(xencons_driver)->name_base = xc_num; + break; + case XC_SERIAL: + DRV(xencons_driver)->name = "ttyS"; + DRV(xencons_driver)->minor_start = 64 + xc_num; + DRV(xencons_driver)->name_base = xc_num; + break; + default: + DRV(xencons_driver)->name = "tty"; + DRV(xencons_driver)->minor_start = 1; + DRV(xencons_driver)->name_base = 1; + break; + } + + tty_set_operations(xencons_driver, &xencons_ops); + + if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) { + printk("WARNING: Failed to register Xen virtual " + "console driver as '%s%d'\n", + DRV(xencons_driver)->name, + DRV(xencons_driver)->name_base); + put_tty_driver(xencons_driver); + xencons_driver = NULL; + return rc; + } + + if (is_initial_xendomain()) { + xencons_priv_irq = bind_virq_to_irqhandler( + VIRQ_CONSOLE, + 0, + xencons_priv_interrupt, + 0, + "console", + NULL); + BUG_ON(xencons_priv_irq < 0); + } + + printk("Xen virtual console successfully installed as %s%d\n", + DRV(xencons_driver)->name, xc_num); + + return 0; +} + +module_init(xencons_init); + +MODULE_LICENSE("Dual BSD/GPL"); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/console/xencons_ring.c 2007-06-12 13:13:44.000000000 +0200 @@ -0,0 +1,143 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into 
other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static int xencons_irq; + +static inline struct xencons_interface *xencons_interface(void) +{ + return mfn_to_virt(xen_start_info->console.domU.mfn); +} + +static inline void notify_daemon(void) +{ + /* Use evtchn: this is called early, before irq is set up. */ + notify_remote_via_evtchn(xen_start_info->console.domU.evtchn); +} + +int xencons_ring_send(const char *data, unsigned len) +{ + int sent = 0; + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; + + cons = intf->out_cons; + prod = intf->out_prod; + mb(); + BUG_ON((prod - cons) > sizeof(intf->out)); + + while ((sent < len) && ((prod - cons) < sizeof(intf->out))) + intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; + + wmb(); + intf->out_prod = prod; + + notify_daemon(); + + return sent; +} + +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs) +{ + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; + + cons = intf->in_cons; + prod = intf->in_prod; + mb(); + BUG_ON((prod - cons) > sizeof(intf->in)); + + while (cons != prod) { + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs); + cons++; + } + + mb(); + intf->in_cons = cons; + + notify_daemon(); + + xencons_tx(); + + return IRQ_HANDLED; +} + +int xencons_ring_init(void) +{ + int irq; + + if (xencons_irq) + unbind_from_irqhandler(xencons_irq, NULL); + xencons_irq = 0; + + if (!is_running_on_xen() || + is_initial_xendomain() || + !xen_start_info->console.domU.evtchn) + return -ENODEV; + + irq = bind_caller_port_to_irqhandler( + xen_start_info->console.domU.evtchn, + handle_input, 0, "xencons", NULL); + if (irq < 0) { + printk(KERN_ERR "XEN console request irq failed %i\n", irq); + return irq; + } + + xencons_irq = irq; + + /* In case we have in-flight data after save/restore... */ + notify_daemon(); + + return 0; +} + +void xencons_resume(void) +{ + (void)xencons_ring_init(); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/core/Makefile 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,14 @@ +# +# Makefile for the linux kernel. 
+# + +obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o + +obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_PROC_FS) += xen_proc.o +obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor_sysfs.o +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o +obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o +obj-$(CONFIG_KEXEC) += machine_kexec.o +obj-$(CONFIG_XEN_XENCOMM) += xencomm.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/core/cpu_hotplug.c 2011-01-24 12:06:05.000000000 +0100 @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Set of CPUs that remote admin software will allow us to bring online. + * Notified to us via xenbus. + */ +static cpumask_t xenbus_allowed_cpumask; + +/* Set of CPUs that local admin will allow us to bring online. */ +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL; + +static int local_cpu_hotplug_request(void) +{ + /* + * We assume a CPU hotplug request comes from local admin if it is made + * via a userspace process (i.e., one with a real mm_struct). + */ + return (current->mm != NULL); +} + +static void vcpu_hotplug(unsigned int cpu, struct sys_device *dev) +{ + int err; + char dir[32], state[32]; + + if ((cpu >= NR_CPUS) || !cpu_possible(cpu)) + return; + + sprintf(dir, "cpu/%u", cpu); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); + if (err != 1) { + printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); + return; + } + + if (strcmp(state, "online") == 0) { + cpu_set(cpu, xenbus_allowed_cpumask); + if (!cpu_up(cpu) && dev) + kobject_uevent(&dev->kobj, KOBJ_ONLINE); + } else if (strcmp(state, "offline") == 0) { + cpu_clear(cpu, xenbus_allowed_cpumask); + if (!cpu_down(cpu) && dev) + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); + } else { + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", + state, cpu); + } +} + +static void handle_vcpu_hotplug_event( + struct xenbus_watch *watch, const char **vec, unsigned int len) +{ + unsigned int cpu; + char *cpustr; + const char *node = vec[XS_WATCH_PATH]; + + if ((cpustr = strstr(node, "cpu/")) != NULL) { + sscanf(cpustr, "cpu/%u", &cpu); + vcpu_hotplug(cpu, get_cpu_sysdev(cpu)); + } +} + +static int smpboot_cpu_notify(struct notifier_block *notifier, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* + * We do this in a callback notifier rather than __cpu_disable() + * because local_cpu_hotplug_request() does not work in the latter + * as it's always executed from within a stopmachine kthread. 
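+ * (A kernel thread has no mm, so local_cpu_hotplug_request() would always + * report such a request as remote.)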
+ */ + if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request()) + cpu_clear(cpu, local_allowed_cpumask); + + return NOTIFY_OK; +} + +static int setup_cpu_watcher(struct notifier_block *notifier, + unsigned long event, void *data) +{ + unsigned int i; + + static struct xenbus_watch cpu_watch = { + .node = "cpu", + .callback = handle_vcpu_hotplug_event, + .flags = XBWF_new_thread }; + (void)register_xenbus_watch(&cpu_watch); + + if (!is_initial_xendomain()) { + for_each_possible_cpu(i) + vcpu_hotplug(i, get_cpu_sysdev(i)); + printk(KERN_INFO "Brought up %ld CPUs\n", + (long)num_online_cpus()); + } + + return NOTIFY_DONE; +} + +static int __init setup_vcpu_hotplug_event(void) +{ + static struct notifier_block hotplug_cpu = { + .notifier_call = smpboot_cpu_notify }; + static struct notifier_block xsn_cpu = { + .notifier_call = setup_cpu_watcher }; + + if (!is_running_on_xen()) + return -ENODEV; + + register_cpu_notifier(&hotplug_cpu); + register_xenstore_notifier(&xsn_cpu); + + return 0; +} + +arch_initcall(setup_vcpu_hotplug_event); + +int smp_suspend(void) +{ + unsigned int cpu; + int err; + + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + err = cpu_down(cpu); + if (err) { + printk(KERN_CRIT "Failed to take all CPUs " + "down: %d.\n", err); + for_each_possible_cpu(cpu) + vcpu_hotplug(cpu, NULL); + return err; + } + } + + return 0; +} + +void smp_resume(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + vcpu_hotplug(cpu, NULL); + } +} + +int cpu_up_check(unsigned int cpu) +{ + int rc = 0; + + if (local_cpu_hotplug_request()) { + cpu_set(cpu, local_allowed_cpumask); + if (!cpu_isset(cpu, xenbus_allowed_cpumask)) { + printk("%s: attempt to bring up CPU %u disallowed by " + "remote admin.\n", __FUNCTION__, cpu); + rc = -EBUSY; + } + } else if (!cpu_isset(cpu, local_allowed_cpumask) || + !cpu_isset(cpu, xenbus_allowed_cpumask)) { + rc = -EBUSY; + } + + return rc; +} + +void init_xenbus_allowed_cpumask(void) +{ + xenbus_allowed_cpumask = cpu_present_map; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/core/evtchn.c 2010-11-25 09:36:37.000000000 +0100 @@ -0,0 +1,1204 @@ +/****************************************************************************** + * evtchn.c + * + * Communication via Xen event channels. + * + * Copyright (c) 2002-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* RTC_IRQ */ + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> event-channel mappings. */ +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 }; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +static u32 irq_info[NR_IRQS]; + +/* Binding types. */ +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_LOCAL_PORT, + IRQT_CALLER_PORT, + _IRQT_COUNT +}; + +#define _IRQT_BITS 4 +#define _EVTCHN_BITS 12 +#define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS) + +/* Constructor for packed IRQ information. */ +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + BUILD_BUG_ON(_IRQT_COUNT > (1U << _IRQT_BITS)); + + BUILD_BUG_ON(NR_PIRQS > (1U << _INDEX_BITS)); + BUILD_BUG_ON(NR_VIRQS > (1U << _INDEX_BITS)); + BUILD_BUG_ON(NR_IPIS > (1U << _INDEX_BITS)); + BUG_ON(index >> _INDEX_BITS); + + BUILD_BUG_ON(NR_EVENT_CHANNELS > (1U << _EVTCHN_BITS)); + + return ((type << (32 - _IRQT_BITS)) | (index << _EVTCHN_BITS) | evtchn); +} + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +/* + * Accessors for packed IRQ information. + */ + +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq] & ((1U << _EVTCHN_BITS) - 1); +} + +static inline unsigned int index_from_irq(int irq) +{ + return (irq_info[irq] >> _EVTCHN_BITS) & ((1U << _INDEX_BITS) - 1); +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq] >> (32 - _IRQT_BITS); +} + +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping. */ +#ifndef NR_IPIS +#define NR_IPIS 1 +#endif +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +#ifdef CONFIG_SMP + +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; + +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + shared_info_t *s = HYPERVISOR_shared_info; + int irq = evtchn_to_irq[chn]; + + BUG_ON(!test_bit(chn, s->evtchn_mask)); + + if (irq != -1) + set_native_irq_info(irq, cpumask_of_cpu(cpu)); + + clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); + set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ + int i; + + /* By default all event channels notify CPU#0. 
*/
+	for (i = 0; i < NR_IRQS; i++)
+		set_native_irq_info(i, cpumask_of_cpu(0));
+
+	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+	for_each_possible_cpu(i)
+		memset(cpu_evtchn_mask[i],
+		       (i == 0) ? ~0 : 0,
+		       sizeof(cpu_evtchn_mask[i]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return cpu_evtchn[evtchn];
+}
+
+#else
+
+static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
+					   unsigned int idx)
+{
+	return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return 0;
+}
+
+#endif
+
+/* Upcall to generic IRQ layer. */
+#ifdef CONFIG_X86
+extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
+void __init xen_init_IRQ(void);
+void __init init_IRQ(void)
+{
+	irq_ctx_init(0);
+	xen_init_IRQ();
+}
+#if defined (__i386__)
+static inline void exit_idle(void) {}
+#define IRQ_REG orig_eax
+#elif defined (__x86_64__)
+#include
+#define IRQ_REG orig_rax
+#endif
+#define do_IRQ(irq, regs) do {		\
+	(regs)->IRQ_REG = ~(irq);	\
+	do_IRQ((regs));			\
+} while (0)
+#endif
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)	((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+	VOID(HYPERVISOR_xen_version(0, NULL));
+}
+/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
+EXPORT_SYMBOL(force_evtchn_callback);
+
+static DEFINE_PER_CPU(unsigned int, upcall_count);
+static DEFINE_PER_CPU(unsigned int, current_l1i);
+static DEFINE_PER_CPU(unsigned int, current_l2i);
+
+/* NB. Interrupts are disabled on entry. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
+{
+	unsigned long l1, l2;
+	unsigned long masked_l1, masked_l2;
+	unsigned int l1i, l2i, start_l1i, start_l2i, port, count, i;
+	int irq;
+	unsigned int cpu = smp_processor_id();
+	shared_info_t *s = HYPERVISOR_shared_info;
+	vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
+
+	exit_idle();
+	irq_enter();
+
+	do {
+		/* Avoid a callback storm when we reenable delivery. */
+		vcpu_info->evtchn_upcall_pending = 0;
+
+		/* Nested invocations bail immediately. */
+		if (unlikely(per_cpu(upcall_count, cpu)++))
+			break;
+
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+		/* Clear master flag /before/ clearing selector flag. */
+		wmb();
+#endif
+
+		/*
+		 * Handle timer interrupts before all others, so that all
+		 * hardirq handlers see an up-to-date system time even if we
+		 * have just woken from a long idle period.
+		 */
+		if ((irq = __get_cpu_var(virq_to_irq)[VIRQ_TIMER]) != -1) {
+			port = evtchn_from_irq(irq);
+			l1i = port / BITS_PER_LONG;
+			l2i = port % BITS_PER_LONG;
+			if (active_evtchns(cpu, s, l1i) & (1ul<<l2i))
+				do_IRQ(irq, regs);
+		}
+
+		l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+
+		start_l1i = l1i = per_cpu(current_l1i, cpu);
+		start_l2i = per_cpu(current_l2i, cpu);
+
+		for (i = 0; l1 != 0; i++) {
+			masked_l1 = l1 & ((~0UL) << l1i);
+			/* If we masked out all events, wrap to beginning. */
+			if (masked_l1 == 0) {
+				l1i = l2i = 0;
+				continue;
+			}
+			l1i = __ffs(masked_l1);
+
+			l2 = active_evtchns(cpu, s, l1i);
+			l2i = 0; /* usually scan entire word from start */
+			if (l1i == start_l1i) {
+				/* We scan the starting word in two parts.
*/ + if (i == 0) + /* 1st time: start in the middle */ + l2i = start_l2i; + else + /* 2nd time: mask bits done already */ + l2 &= (1ul << start_l2i) - 1; + } + + do { + masked_l2 = l2 & ((~0UL) << l2i); + if (masked_l2 == 0) + break; + l2i = __ffs(masked_l2); + + /* process port */ + port = (l1i * BITS_PER_LONG) + l2i; + if ((irq = evtchn_to_irq[port]) != -1) + do_IRQ(irq, regs); + else + evtchn_device_upcall(port); + + l2i = (l2i + 1) % BITS_PER_LONG; + + /* Next caller starts at last processed + 1 */ + per_cpu(current_l1i, cpu) = + l2i ? l1i : (l1i + 1) % BITS_PER_LONG; + per_cpu(current_l2i, cpu) = l2i; + + } while (l2i != 0); + + /* Scan start_l1i twice; all others once. */ + if ((l1i != start_l1i) || (i != 0)) + l1 &= ~(1UL << l1i); + + l1i = (l1i + 1) % BITS_PER_LONG; + } + + /* If there were nested callbacks then we have more to do. */ + count = per_cpu(upcall_count, cpu); + per_cpu(upcall_count, cpu) = 0; + } while (unlikely(count != 1)); + + irq_exit(); +} + +static int find_unbound_irq(void) +{ + static int warned; + int irq; + + for (irq = DYNIRQ_BASE; irq < (DYNIRQ_BASE + NR_DYNIRQS); irq++) + if (irq_bindcount[irq] == 0) + return irq; + + if (!warned) { + warned = 1; + printk(KERN_WARNING "No available IRQ to bind to: " + "increase NR_DYNIRQS.\n"); + } + + return -ENOSPC; +} + +static int bind_caller_port_to_irq(unsigned int caller_port) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = evtchn_to_irq[caller_port]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + evtchn_to_irq[caller_port] = irq; + irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_local_port_to_irq(unsigned int local_port) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + BUG_ON(evtchn_to_irq[local_port] != -1); + + if ((irq = find_unbound_irq()) < 0) { + struct evtchn_close close = { .port = local_port }; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + BUG(); + goto out; + } + + evtchn_to_irq[local_port] = irq; + irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_listening_port_to_irq(unsigned int remote_domain) +{ + struct evtchn_alloc_unbound alloc_unbound; + int err; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = remote_domain; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + + return err ? : bind_local_port_to_irq(alloc_unbound.port); +} + +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? 
: bind_local_port_to_irq(bind_interdomain.local_port); +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { + if ((irq = find_unbound_irq()) < 0) + goto out; + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + unsigned int cpu; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { + close.port = evtchn; + if ((type_from_irq(irq) != IRQT_CALLER_PORT) && + HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + /* Zap stats across IRQ changes of use. 
*/ + for_each_possible_cpu(cpu) + kstat_cpu(cpu).irqs[irq] = 0; + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_caller_port_to_irqhandler( + unsigned int caller_port, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_caller_port_to_irq(caller_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler); + +int bind_listening_port_to_irqhandler( + unsigned int remote_domain, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_listening_port_to_irq(remote_domain); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler); + +int bind_interdomain_evtchn_to_irqhandler( + unsigned int remote_domain, + unsigned int remote_port, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler( + unsigned int virq, + unsigned int cpu, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_virq_to_irq(virq, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler( + unsigned int ipi, + unsigned int cpu, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +#ifdef CONFIG_SMP +void rebind_evtchn_to_cpu(int port, unsigned int cpu) +{ + struct evtchn_bind_vcpu ebv = { .port = port, .vcpu = cpu }; + int masked; + + masked = test_and_set_evtchn_mask(port); + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &ebv) == 0) + bind_evtchn_to_cpu(port, cpu); + if (!masked) + unmask_evtchn(port); +} + +static void rebind_irq_to_cpu(unsigned int irq, unsigned int tcpu) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + rebind_evtchn_to_cpu(evtchn, tcpu); +} + +static void set_affinity_irq(unsigned int irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} +#endif + +int resend_irq_on_evtchn(unsigned int irq) +{ + int masked, evtchn = evtchn_from_irq(irq); + 
shared_info_t *s = HYPERVISOR_shared_info; + + if (!VALID_EVTCHN(evtchn)) + return 1; + + masked = test_and_set_evtchn_mask(evtchn); + synch_set_bit(evtchn, s->evtchn_pending); + if (!masked) + unmask_evtchn(evtchn); + + return 1; +} + +/* + * Interface to generic handling in irq.c + */ + +static unsigned int startup_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); + return 0; +} + +static void shutdown_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) + unmask_evtchn(evtchn); +} + +static struct hw_interrupt_type dynirq_type = { + .typename = "Dynamic-irq", + .startup = startup_dynirq, + .shutdown = shutdown_dynirq, + .enable = enable_dynirq, + .disable = disable_dynirq, + .ack = ack_dynirq, + .end = end_dynirq, +#ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, +#endif + .retrigger = resend_irq_on_evtchn, +}; + +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ +static int pirq_eoi_does_unmask; +static unsigned long *pirq_needs_eoi; + +static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq) +{ + struct physdev_eoi eoi = { .irq = evtchn_get_xen_pirq(irq) }; + + if (pirq_eoi_does_unmask) { + if (test_bit(eoi.irq, pirq_needs_eoi)) + VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi)); + else + unmask_evtchn(evtchn); + } else if (test_bit(irq - PIRQ_BASE, pirq_needs_eoi)) { + if (smp_processor_id() != cpu_from_evtchn(evtchn)) { + struct evtchn_unmask unmask = { .port = evtchn }; + struct multicall_entry mcl[2]; + + mcl[0].op = __HYPERVISOR_event_channel_op; + mcl[0].args[0] = EVTCHNOP_unmask; + mcl[0].args[1] = (unsigned long)&unmask; + mcl[1].op = __HYPERVISOR_physdev_op; + mcl[1].args[0] = PHYSDEVOP_eoi; + mcl[1].args[1] = (unsigned long)&eoi; + + if (HYPERVISOR_multicall(mcl, 2)) + BUG(); + } else { + unmask_evtchn(evtchn); + VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi)); + } + } else + unmask_evtchn(evtchn); +} + +static inline void pirq_query_unmask(int irq) +{ + struct physdev_irq_status_query irq_status; + + if (pirq_eoi_does_unmask) + return; + irq_status.irq = evtchn_get_xen_pirq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + clear_bit(irq - PIRQ_BASE, pirq_needs_eoi); + if (irq_status.flags & XENIRQSTAT_needs_eoi) + set_bit(irq - PIRQ_BASE, pirq_needs_eoi); +} + +/* + * On startup, if there is no action associated with the IRQ then we are + * probing. In this case we should not share with others as it will confuse us. + */ +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) + +static unsigned int startup_pirq(unsigned int irq) +{ + struct evtchn_bind_pirq bind_pirq; + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = evtchn_get_xen_pirq(irq); + /* NB. 
We are happy to share unless we are probing. */ + bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) { + if (!probing_irq(irq)) + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", + irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(irq); + + evtchn_to_irq[evtchn] = irq; + bind_evtchn_to_cpu(evtchn, 0); + irq_info[irq] = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, evtchn); + + out: + pirq_unmask_and_notify(evtchn, irq); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + mask_evtchn(evtchn); + + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = mk_irq_info(IRQT_PIRQ, index_from_irq(irq), 0); +} + +static void enable_pirq(unsigned int irq) +{ + startup_pirq(irq); +} + +static void disable_pirq(unsigned int irq) +{ +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if ((irq_desc[irq].status & (IRQ_DISABLED|IRQ_PENDING)) == + (IRQ_DISABLED|IRQ_PENDING)) { + shutdown_pirq(irq); + } else if (VALID_EVTCHN(evtchn)) + pirq_unmask_and_notify(evtchn, irq); +} + +static struct hw_interrupt_type pirq_type = { + .typename = "Phys-irq", + .startup = startup_pirq, + .shutdown = shutdown_pirq, + .enable = enable_pirq, + .disable = disable_pirq, + .ack = ack_pirq, + .end = end_pirq, +#ifdef CONFIG_SMP + .set_affinity = set_affinity_irq, +#endif + .retrigger = resend_irq_on_evtchn, +}; + +int irq_ignore_unhandled(unsigned int irq) +{ + struct physdev_irq_status_query irq_status = { .irq = irq }; + + if (!is_running_on_xen()) + return 0; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !!(irq_status.flags & XENIRQSTAT_shared); +} + +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +int irq_to_evtchn_port(int irq) +{ + return evtchn_from_irq(irq); +} +EXPORT_SYMBOL_GPL(irq_to_evtchn_port); + +void mask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_set_bit(port, s->evtchn_mask); +} +EXPORT_SYMBOL_GPL(mask_evtchn); + +void unmask_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned int cpu = smp_processor_id(); + vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask)); + return; + } + + synch_clear_bit(port, s->evtchn_mask); + + /* Did we miss an interrupt 'edge'? Re-fire if so. 
*/ + if (synch_test_bit(port, s->evtchn_pending) && + !synch_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; +} +EXPORT_SYMBOL_GPL(unmask_evtchn); + +void disable_all_local_evtchn(void) +{ + unsigned i, cpu = smp_processor_id(); + shared_info_t *s = HYPERVISOR_shared_info; + + for (i = 0; i < NR_EVENT_CHANNELS; ++i) + if (cpu_from_evtchn(i) == cpu) + synch_set_bit(i, &s->evtchn_mask[0]); +} + +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + + } +} + +void irq_resume(void) +{ + unsigned int cpu, irq, evtchn; + + init_evtchn_cpu_bindings(); + + if (pirq_eoi_does_unmask) { + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; + if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn)) + BUG(); + } + + /* New event-channel space is not 'live' yet. */ + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + mask_evtchn(evtchn); + + /* Check that no PIRQs are still bound. */ + for (irq = PIRQ_BASE; irq < (PIRQ_BASE + NR_PIRQS); irq++) + BUG_ON(irq_info[irq] != IRQ_UNBOUND); + + /* No IRQ <-> event-channel mappings. 
*/ + for (irq = 0; irq < NR_IRQS; irq++) + irq_info[irq] &= ~((1U << _EVTCHN_BITS) - 1); + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + evtchn_to_irq[evtchn] = -1; + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } + +} + +#if defined(CONFIG_X86_IO_APIC) +#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE)) +#elif defined(CONFIG_X86) +#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < 16) +#else +#define identity_mapped_irq(irq) (1) +#endif + +void evtchn_register_pirq(int irq) +{ + BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS); + if (identity_mapped_irq(irq) || type_from_irq(irq) != IRQT_UNBOUND) + return; + irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, 0); + irq_desc[irq].chip = &pirq_type; +} + +int evtchn_map_pirq(int irq, int xen_pirq) +{ + if (irq < 0) { + static DEFINE_SPINLOCK(irq_alloc_lock); + + irq = PIRQ_BASE + NR_PIRQS - 1; + spin_lock(&irq_alloc_lock); + do { + if (identity_mapped_irq(irq)) + continue; + if (!index_from_irq(irq)) { + BUG_ON(type_from_irq(irq) != IRQT_UNBOUND); + irq_info[irq] = mk_irq_info(IRQT_PIRQ, + xen_pirq, 0); + break; + } + } while (--irq >= PIRQ_BASE); + spin_unlock(&irq_alloc_lock); + if (irq < PIRQ_BASE) + return -ENOSPC; + irq_desc[irq].chip = &pirq_type; + } else if (!xen_pirq) { + if (unlikely(type_from_irq(irq) != IRQT_PIRQ)) + return -EINVAL; + irq_desc[irq].chip = &no_irq_type; + irq_info[irq] = IRQ_UNBOUND; + return 0; + } else if (type_from_irq(irq) != IRQT_PIRQ + || index_from_irq(irq) != xen_pirq) { + printk(KERN_ERR "IRQ#%d is already mapped to %d:%u - " + "cannot map to PIRQ#%u\n", + irq, type_from_irq(irq), index_from_irq(irq), xen_pirq); + return -EINVAL; + } + return index_from_irq(irq) ? irq : -EINVAL; +} + +int evtchn_get_xen_pirq(int irq) +{ + if (identity_mapped_irq(irq)) + return irq; + BUG_ON(type_from_irq(irq) != IRQT_PIRQ); + return index_from_irq(irq); +} + +void __init xen_init_IRQ(void) +{ + unsigned int i; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + init_evtchn_cpu_bindings(); + + pirq_needs_eoi = alloc_bootmem_pages(sizeof(unsigned long) + * BITS_TO_LONGS(ALIGN(NR_PIRQS, PAGE_SIZE * 8))); + eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT; + if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0) + pirq_eoi_does_unmask = 1; + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* No IRQ -> event-channel mappings. */ + for (i = 0; i < NR_IRQS; i++) + irq_info[i] = IRQ_UNBOUND; + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) { + irq_bindcount[i] = 0; + + irq_desc[i].status = IRQ_DISABLED|IRQ_NOPROBE; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + irq_desc[i].chip = &dynirq_type; + } + + /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */ + for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_PIRQS); i++) { + irq_bindcount[i] = 1; + + if (!identity_mapped_irq(i)) + continue; + +#ifdef RTC_IRQ + /* If not domain 0, force our RTC driver to fail its probe. */ + if (i - PIRQ_BASE == RTC_IRQ && !is_initial_xendomain()) + continue; +#endif + + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + irq_desc[i].chip = &pirq_type; + } +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/drivers/xen/core/firmware.c 2007-06-22 09:08:06.000000000 +0200 @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include