From: Mark Fasheh Date: Thu, 19 Nov 2009 15:15:38 -0800 Subject: ocfs2: allocation reservations Patch-mainline: 2.6.33? References: bnc#501563 FATE#307247 ocfs2: allocation reservations This patch improves Ocfs2 allocation policy by allowing an inode to reserve a portion of the local alloc bitmap for itself. Allocation windows are advisory in that they won't block use of that portion of the bitmap. This makes dealing with corner cases much easier - we can always fall back to previous policy. Reservation windows are represented internally by a red-black tree. Within that tree, each node represents the reservation window of one inode. When new data is written, we try to allocate from the window first. If that allocation fails, we fall back to our old heuristics and a new window is computed from the results. Allocation windows will also be extended if allocation from them succeeds. Signed-off-by: Mark Fasheh --- Documentation/filesystems/ocfs2.txt | 3 fs/ocfs2/Makefile | 1 fs/ocfs2/aops.c | 2 fs/ocfs2/cluster/masklog.c | 1 fs/ocfs2/cluster/masklog.h | 1 fs/ocfs2/dir.c | 2 fs/ocfs2/file.c | 19 + fs/ocfs2/inode.c | 4 fs/ocfs2/inode.h | 2 fs/ocfs2/localalloc.c | 39 +- fs/ocfs2/ocfs2.h | 5 fs/ocfs2/reservations.c | 668 ++++++++++++++++++++++++++++++++++++ fs/ocfs2/reservations.h | 151 ++++++++ fs/ocfs2/suballoc.c | 1 fs/ocfs2/suballoc.h | 2 fs/ocfs2/super.c | 27 + 16 files changed, 922 insertions(+), 6 deletions(-) --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -80,3 +80,6 @@ user_xattr (*) Enables Extended User Att nouser_xattr Disables Extended User Attributes. acl Enables POSIX Access Control Lists support. noacl (*) Disables POSIX Access Control Lists support. +resv_level=3 (*) Set how aggressive allocation reservations will be. + Valid values range from 0 (reservations off) to 6 + (maximum space for reservations). 
--- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -29,6 +29,7 @@ ocfs2-objs := \ mmap.o \ namei.o \ refcounttree.o \ + reservations.o \ resize.o \ slot_map.o \ suballoc.o \ --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1735,6 +1735,8 @@ int ocfs2_write_begin_nolock(struct addr goto out; } + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list, clusters_to_alloc); --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[ define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), + define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,6 +119,7 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2991,6 +2991,7 @@ static int ocfs2_expand_inline_dir(struc * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. 
*/ + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -3368,6 +3369,7 @@ static int ocfs2_extend_dir(struct ocfs2 mlog_errno(status); goto bail; } + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; credits = ocfs2_calc_extend_credits(sb, el, 1); } else { --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -147,6 +147,7 @@ leave: static int ocfs2_file_release(struct inode *inode, struct file *file) { struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, file->f_path.dentry->d_name.len, @@ -157,6 +158,21 @@ static int ocfs2_file_release(struct ino oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; spin_unlock(&oi->ip_lock); +#if 0 + /* + * Disable this for now. Keeping the reservation around a bit + * longer gives an improvement for workloads which rapidly do + * open()/write()/close() against a file. + */ + if ((file->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) { + down_write(&oi->ip_alloc_sem); + ocfs2_resv_discard(&osb->osb_la_resmap, + &oi->ip_la_data_resv); + up_write(&oi->ip_alloc_sem); + } +#endif + ocfs2_free_file_private(inode, file); mlog_exit(0); @@ -488,6 +504,9 @@ static int ocfs2_truncate_file(struct in down_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + /* * The inode lock forced other nodes to sync and drop their * pages, which (correctly) happens even if we have a truncate --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1101,6 +1101,10 @@ void ocfs2_clear_inode(struct inode *ino ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + /* We very well may get a clear_inode before all an inodes * metadata has 
hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -70,6 +70,8 @@ struct ocfs2_inode_info /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; + + struct ocfs2_alloc_reservation ip_la_data_resv; }; /* --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits( static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc, - u32 numbits); + u32 numbits, + struct ocfs2_alloc_reservation *resv); static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); @@ -262,6 +263,8 @@ void ocfs2_shutdown_local_alloc(struct o osb->local_alloc_state = OCFS2_LA_DISABLED; + ocfs2_resmap_uninit(&osb->osb_la_resmap); + main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -498,7 +501,7 @@ static int ocfs2_local_alloc_in_range(st alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted, NULL); if (start == -1) { mlog_errno(-ENOSPC); return 0; @@ -664,7 +667,8 @@ int ocfs2_claim_local_alloc_bits(struct alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted, + ac->ac_resv); if (start == -1) { /* TODO: Shouldn't we just BUG here? 
*/ status = -ENOSPC; @@ -687,6 +691,9 @@ int ocfs2_claim_local_alloc_bits(struct goto bail; } + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + while(bits_wanted--) ocfs2_set_bit(start++, bitmap); @@ -722,11 +729,13 @@ static u32 ocfs2_local_alloc_count_bits( } static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, - struct ocfs2_dinode *alloc, - u32 numbits) + struct ocfs2_dinode *alloc, + u32 numbits, + struct ocfs2_alloc_reservation *resv) { int numfound, bitoff, left, startoff, lastzero; void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; mlog_entry("(numbits wanted = %u)\n", numbits); @@ -738,6 +747,20 @@ static int ocfs2_local_alloc_find_clear_ bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + /* + * Ask the reservations code first whether this request can be + * easily fulfilled. No errors here are fatal - if we didn't + * find the number of bits needed, we'll just take the slow + * path. + */ + if (ocfs2_resmap_resv_bits(resmap, resv, bitmap, &bitoff, &numfound) + == 0) { + if (numfound >= numbits) { + numfound = numbits; + goto bail; + } + } + numfound = bitoff = startoff = 0; lastzero = -1; left = le32_to_cpu(alloc->id1.bitmap1.i_total); @@ -772,8 +795,10 @@ static int ocfs2_local_alloc_find_clear_ if (numfound == numbits) bitoff = startoff - numfound; - else + else { + numfound = 0; bitoff = -1; + } bail: mlog_exit(bitoff); @@ -1096,6 +1121,8 @@ retry_enospc: memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, le16_to_cpu(la->la_size)); + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count); + mlog(0, "New window allocated:\n"); mlog(0, "window la_bm_off = %u\n", OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -47,6 +47,7 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" +#include "reservations.h" /* Caching of metadata buffers */ @@ -349,6 +350,10 @@ struct ocfs2_super u64 la_last_gd; + struct ocfs2_reservation_map 
osb_la_resmap; + + unsigned int osb_resv_level; + /* Next three fields are for local node slot recovery during * mount. */ int dirty; --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,668 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2009 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +#define MLOG_MASK_PREFIX ML_RESERVATIONS +#include + +#include "ocfs2.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +#define OCFS2_CHECK_RESERVATIONS + + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + + mlog(0, "resv_level: %u\n", osb->osb_resv_level); + + switch (osb->osb_resv_level) { + case 6: + return OCFS2_MAX_RESV_WINDOW_BITS; + case 5: + return 512; + case 4: + return 256; + case 3: + return 128; + case 2: + return 64; + } + + return OCFS2_MIN_RESV_WINDOW_BITS; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. 
Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\tallocated: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len, resv->r_allocated); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found\n", i); +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) > resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. 
*/ + + return 0; +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_allocated = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_inuse) { + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_inuse = 0; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. + */ + resv->r_last_len = resv->r_last_start = 0; + } +} + +/* If any parameters have changed, this function will call + * ocfs2_resv_trunc against all existing reservations. */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +/* + * Determine the number of available bits between my_resv and the next + * window and extends my_resv accordingly. 
+ */ +static int ocfs2_try_to_extend_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *my_resv) +{ + unsigned int available, avail_end; + struct rb_node *next, *node = &my_resv->r_node; + struct ocfs2_alloc_reservation *next_resv; + unsigned int bits = ocfs2_resv_window_bits(resmap); + + next = rb_next(node); + + if (next) { + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + avail_end = next_resv->r_start; + } else { + avail_end = resmap->m_bitmap_len - 1; + } + + if (ocfs2_resv_end(my_resv) == avail_end) + return -ENOENT; + + available = avail_end - ocfs2_resv_end(my_resv) - 1; + + my_resv->r_len += available; + if (my_resv->r_len > bits) + my_resv->r_len = bits; + + ocfs2_check_resmap(resmap); + + return 0; +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, + new->r_len); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) + p = &(*p)->rb_left; + else if (new->r_start > ocfs2_resv_end(tmp)) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_inuse = 1; + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. 
+ */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv; + struct rb_node *n = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!n) + return NULL; + + do { + resv = rb_entry(n, struct ocfs2_alloc_reservation, r_node); + + if (goal < resv->r_start) + n = n->rb_left; + else if (goal > ocfs2_resv_end(resv)) + n = n->rb_right; + else + return resv; + } while (n); + + /* + * The goal sits on one end of the tree. If it's the leftmost + * end, we return NULL. + */ + if (resv->r_start > goal) + return NULL; + + return resv; +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int last_start = resv->r_last_start; + unsigned int goal = 0; + unsigned int len = ocfs2_resv_window_bits(resmap); + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + + if (resv->r_last_len) { + unsigned int last_end = last_start + resv->r_last_len - 1; + + goal = last_end + 1; + + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window we want. 
+ */ + + mlog(0, "Empty root\n"); + + resv->r_start = goal; + resv->r_len = len; + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) + resv->r_len = resmap->m_bitmap_len - resv->r_start; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv(resmap, goal); + + if (prev_resv == NULL) { + mlog(0, "Farthest left window\n"); + + /* Ok, we're the farthest left window. */ + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * Try to allocate at far left of tree. If that + * doesn't fit, we just start our linear search from + * next_resv + */ + if (next_resv->r_start > (goal + len - 1)) { + resv->r_start = goal; + resv->r_len = len; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + mlog(0, "One more resv found in linear search\n"); + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + mlog(0, "No next node\n"); + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. 
+ */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = resmap->m_bitmap_len - 1; + gap_len = gap_end - gap_start + 1; + } + + if (gap_start <= gap_end + && gap_start >= goal + && gap_len >= len) { + resv->r_start = gap_start; + resv->r_len = len; + + ocfs2_resv_insert(resmap, resv); + return; + } + + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + spin_lock(&resv_lock); + + mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " + "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", + cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, resv->r_last_len); + + resv->r_last_len = clen; + resv->r_last_start = cstart; + + if (ocfs2_resv_empty(resv)) { + mlog(0, "Empty reservation, find a new window.\n"); + /* + * Allocation occured without a window. We find an + * initial reservation for this inode, based on what + * was allocated already. + */ + ocfs2_resv_find_window(resmap, resv); + goto out_unlock; + } + + /* + * Did the allocation occur completely outside our + * reservation? Clear it then. Otherwise, try to extend our + * reservation or alloc a new one, if we've used all the bits. + */ + if (cend < resv->r_start || + cstart > ocfs2_resv_end(resv)) { + mlog(0, "Allocated outside reservation\n"); + + /* Truncate and remove reservation */ + __ocfs2_resv_discard(resmap, resv); + + if (cend < resv->r_start) { + /* + * The window wasn't used for some reason. We + * should start our search *past* it to give a + * better chance the next window will be + * used. Best way to do this right now is to + * fool the search code... 
+ */ + resv->r_last_start = ocfs2_resv_end(resv) + 1; + resv->r_last_len = 1; + } + + ocfs2_resv_find_window(resmap, resv); + goto out_unlock; + } + + /* + * We allocated at least partially from our + * reservation. Adjust it and try to extend. Otherwise, we + * search for a new window. + */ + + resv->r_allocated += clen; + + if (cend < ocfs2_resv_end(resv)) { + u32 old_end; + + mlog(0, "Allocation left at end\n"); + + /* + * Partial allocation, leaving some bits free at + * end. We move over the start of the window to take + * this into account and try to extend it. + */ + old_end = ocfs2_resv_end(resv); + resv->r_start = cend + 1; /* Start just past last allocation */ + resv->r_len = old_end - resv->r_start + 1; + + if (ocfs2_try_to_extend_resv(resmap, resv) == 0) + goto out_unlock; + } + + mlog(0, "discard reservation\n"); + + /* + * No free bits at end or extend failed above. Truncate and + * re-search for a new window. + */ + + __ocfs2_resv_discard(resmap, resv); + + ocfs2_resv_find_window(resmap, resv); + +out_unlock: + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + spin_unlock(&resv_lock); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + char *disk_bitmap, int *cstart, int *clen) +{ + int ret = -ENOSPC; + unsigned int start, len, best_start = 0, best_len = 0; + + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + if (ocfs2_resv_empty(resv)) { + mlog(0, "empty reservation, find new window\n"); + + ocfs2_resv_find_window(resmap, resv); + + if (ocfs2_resv_empty(resv)) { + /* + * If resv is still empty, we return zero + * bytes and allow ocfs2_resmap_claimed_bits() + * to start our new reservation after the + * allocator has done it's work. 
+ */ + *cstart = *clen = 0; + ret = 0; + goto out; + } + } + + start = resv->r_start; + len = 0; + + while (start <= ocfs2_resv_end(resv)) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(0, + "Reservation was taken at bit %d\n", + start + len); + best_len = 0; + goto next; + } + + /* This is basic, but since the local alloc is + * used very predictably, I think we're ok. */ + if (!best_len) { + best_start = start; + best_len = 1; + } else { + best_len++; + } + +next: + start++; + } + + if (best_len) { + ret = 0; + *cstart = best_start; + *clen = best_len; + } +out: + spin_unlock(&resv_lock); + + return ret; +} --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,151 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2009 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include + +struct ocfs2_bitmap_resv_ops; + +#define OCFS2_DEFAULT_RESV_LEVEL 3 +#define OCFS2_MAX_RESV_LEVEL 7 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; + unsigned int r_len; + + unsigned int r_last_len; + unsigned int r_last_start; + + unsigned int r_allocated; + + int r_inuse; +}; + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: reservation map that resv may be linked into + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super that the reservation map belongs to + * @resmap: struct ocfs2_reservation_map to initialize + * + * Zeroes resmap, links it to osb and starts it out with an empty + * reservation tree. The bitmap length stays zero until + * ocfs2_resmap_restart() provides one. + * Currently this cannot fail and always returns 0. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * If any bitmap parameters have changed, this function will call + * ocfs2_trunc_resv against all existing reservations. 
A future + * version will recalculate existing reservations based on the new + * bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @disk_bitmap: up to date (from disk) allocation bitmap + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and disk_bitmap to determine what part (if any) of the + * reservation window is still clear to use. An empty resv passed here + * will just return no allocation. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. If no allocation is found, they are set to zero. + * + * Returns nonzero on error. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + char *disk_bitmap, int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. 
+ */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2 } brelse(ac->ac_bh); ac->ac_bh = NULL; + ac->ac_resv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -54,6 +54,8 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + + struct ocfs2_alloc_reservation *ac_resv; }; void ocfs2_init_steal_slots(struct ocfs2_super *osb); --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -95,6 +95,7 @@ struct mount_options unsigned int atime_quantum; signed short slot; unsigned int localalloc_opt; + unsigned int resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -176,6 +177,7 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_resv_level, Opt_err, }; @@ -202,6 +204,7 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_resv_level, "resv_level=%u"}, {Opt_err, NULL} }; @@ -1030,6 +1033,7 @@ static int ocfs2_fill_super(struct super osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; + osb->osb_resv_level = parsed_options.resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1290,6 +1294,7 @@ static int ocfs2_parse_options(struct su mopt->slot = OCFS2_INVALID_SLOT; mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; if (!options) { status = 1; @@ -1433,6 +1438,17 @@ static int ocfs2_parse_options(struct su mopt->mount_opt |= 
OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1514,6 +1530,9 @@ static int ocfs2_show_options(struct seq else seq_printf(s, ",noacl"); + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + return 0; } @@ -1688,6 +1707,8 @@ static void ocfs2_inode_init_once(void * oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + ocfs2_resv_init_once(&oi->ip_la_data_resv); + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); @@ -2042,6 +2063,12 @@ static int ocfs2_initialize_super(struct init_waitqueue_head(&osb->osb_mount_event); + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n");